def fetch(self, skip_save=False):
    try:
        html = requests.get(self.story.story_permalink, headers=self.headers)
        original_text_doc = readability.Document(html.text, url=html.url, debug=settings.DEBUG)
        content = original_text_doc.summary(html_partial=True)
    except Exception:
        content = None

    if content:
        if not skip_save:
            self.story.original_text_z = zlib.compress(content)
            self.story.save()
        logging.user(self.request, "~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
            len(unicode(content)),
            self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))))
    else:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
            len(zlib.decompress(self.story.story_content_z))))

    return content
def fetch(self, skip_save=False, return_document=False):
    if self.story_url and any(broken_url in self.story_url for broken_url in BROKEN_URLS):
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned")
        return

    try:
        resp = self.fetch_request()
    except TimeoutError:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
        resp = None
    except requests.exceptions.TooManyRedirects:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
        resp = None

    if not resp:
        return

    try:
        text = resp.text
    except (LookupError, TypeError):
        text = resp.content

    # if self.debug:
    #     logging.user(self.request, "~FBOriginal text's website: %s" % text)

    if resp.encoding and resp.encoding != 'utf-8':
        try:
            text = text.encode(resp.encoding)
        except (LookupError, UnicodeEncodeError):
            pass

    if text:
        # Non-breaking spaces get mangled when the encoding is not utf-8
        text = text.replace("\xc2\xa0", " ")
        text = text.replace("\u00a0", " ")

    original_text_doc = readability.Document(
        text, url=resp.url,
        positive_keywords="post, entry, postProp, article, postContent, postField")
    try:
        content = original_text_doc.summary(html_partial=True)
    except (readability.Unparseable, ParserError) as e:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
        return
def fetch(self, skip_save=False, return_document=False):
    try:
        resp = self.fetch_request()
    except TimeoutError:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
        resp = None
    except requests.exceptions.TooManyRedirects:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
        resp = None

    if not resp:
        return

    try:
        text = resp.text
    except (LookupError, TypeError):
        text = resp.content

    # Only re-encode when the server did not declare a charset explicitly
    charset_declared = 'charset' in resp.headers.get('content-type', "")
    if resp.encoding and resp.encoding != 'utf-8' and not charset_declared:
        try:
            text = text.encode(resp.encoding)
        except (LookupError, UnicodeEncodeError):
            pass

    original_text_doc = readability.Document(text, url=resp.url, debug=settings.DEBUG)
    try:
        content = original_text_doc.summary(html_partial=True)
    except readability.Unparseable:
        return

    try:
        title = original_text_doc.title()
    except TypeError:
        title = ""

    url = resp.url

    if content:
        if self.story and not skip_save:
            self.story.original_text_z = zlib.compress(content)
            try:
                self.story.save()
            except NotUniqueError:
                pass
        logging.user(self.request, (
            "~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
                len(unicode(content)),
                self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
            )), warn_color=False)
    else:
        logging.user(self.request, (
            "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
                self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
            )), warn_color=False)

    if return_document:
        return dict(content=content, title=title, url=url, doc=original_text_doc)

    return content
def fetch(self, skip_save=False, return_document=False):
    if self.story_url and any(broken_url in self.story_url for broken_url in BROKEN_URLS):
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned")
        return

    try:
        resp = self.fetch_request()
    except TimeoutError:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
        resp = None
    except requests.exceptions.TooManyRedirects:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
        resp = None

    if not resp:
        return

    text = resp.text
    original_text_doc = readability.Document(text, url=resp.url, debug=self.debug,
                                             positive_keywords=["postContent", "postField"])
    try:
        content = original_text_doc.summary(html_partial=True)
    except readability.Unparseable:
        return

    try:
        title = original_text_doc.title()
    except TypeError:
        title = ""

    url = resp.url

    if content:
        if self.story and not skip_save:
            self.story.original_text_z = zlib.compress(smart_str(content))
            try:
                self.story.save()
            except NotUniqueError:
                pass
        logging.user(self.request, (
            "~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
                len(content),
                self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
            )), warn_color=False)
    else:
        logging.user(self.request, (
            "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
                self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
            )), warn_color=False)

    if return_document:
        return dict(content=content, title=title, url=url, doc=original_text_doc)

    return content
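# The variants above all delegate the HTTP call to self.fetch_request(), which is not
# shown in this section. A minimal sketch of what such a helper might look like,
# assuming it wraps requests.get with the instance's headers and a fixed timeout.
# The timeout value, the header attribute, and the re-raise as TimeoutError are
# assumptions for illustration, not taken from the source.
def fetch_request(self):
    try:
        return requests.get(self.story_url, headers=self.headers, timeout=10)
    except requests.exceptions.Timeout:
        # Normalize to the TimeoutError that the callers above catch
        # (assumption about how that exception reaches them).
        raise TimeoutError("timed out fetching %s" % self.story_url)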
def fetch(self, skip_save=False):
    try:
        resp = self.fetch_request()
    except TimeoutError:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
        resp = None

    if not resp:
        return

    try:
        text = resp.text
    except (LookupError, TypeError):
        text = resp.content

    if resp.encoding and resp.encoding != 'utf-8':
        try:
            text = text.encode(resp.encoding)
        except (LookupError, UnicodeEncodeError):
            pass

    original_text_doc = readability.Document(text, url=resp.url, debug=settings.DEBUG)
    try:
        content = original_text_doc.summary(html_partial=True)
    except readability.Unparseable:
        return

    if content:
        if not skip_save:
            self.story.original_text_z = zlib.compress(content)
            try:
                self.story.save()
            except NotUniqueError:
                pass
        logging.user(self.request, (
            "~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
                len(unicode(content)),
                self.story.story_content_z and len(zlib.decompress(self.story.story_content_z)))),
            warn_color=False)
    else:
        logging.user(self.request, (
            "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
                self.story.story_content_z and len(zlib.decompress(self.story.story_content_z)))),
            warn_color=False)

    return content
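# Several variants store the extracted HTML compressed on the story
# (story.original_text_z) and log its size against the decompressed
# story_content_z. A standalone sketch of that zlib round-trip; the sample
# string is made up. Note that zlib.compress() needs bytes, which is why one
# variant wraps the content in smart_str() before compressing.
import zlib

content = "<div><p>Extracted article body...</p></div>"
original_text_z = zlib.compress(content.encode('utf-8'))      # what gets saved on the story
restored = zlib.decompress(original_text_z).decode('utf-8')   # what the size logging reads back
assert restored == content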
def fetch(self, skip_save=False, return_document=False):
    if self.story_url and any(broken_url in self.story_url for broken_url in BROKEN_URLS):
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned")
        return

    try:
        resp = self.fetch_request()
    except TimeoutError:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
        resp = None
    except requests.exceptions.TooManyRedirects:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
        resp = None

    if not resp:
        return

    text = resp.text
    original_text_doc = readability.Document(text, url=resp.url, debug=self.debug,
                                             positive_keywords=["postContent", "postField"])
    try:
        content = original_text_doc.summary(html_partial=True)
    except (readability.Unparseable, ParserError) as e:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
        return
self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e) logging.error('error fetch_request'+str(e)+\ # ' feed_id:'+str(self.story.story_feed_id)+\ ' stroy_link:'+str(self.story.story_permalink)) return finally: opener.close() if not text: logging.error('error fetch text: text is null') return #soup = BeautifulSoup(text) #text = soup.renderContents() try: original_text_doc = readability.Document( text, url=self.story.story_permalink) content = original_text_doc.summary(html_partial=True) print "the length of content: %s" % len(content) #content = content.encode("utf-8") except readability.Unparseable, e: logging.error('error getting summary: '+str(e)+\ # ' feed_id:'+str(self.story.story_feed_id)+\ ' stroy_link:'+str(self.story.story_permalink)) # if settings.SEND_ERROR_MAILS: # mail_admins("Error in text_importer Build Document",str(e)+\ # ' feed_id:'+str(self.story.story_feed_id)+\ # ' stroy_link:'+str(self.story.story_permalink)+\ # traceback.format_exc()) return if len(content) < 60:
    text = opener.open(request).read()
    print 'hi'
    print len(text)
    # Detect gzip magic bytes and decompress by hand, since urllib2 does not
    if text[:6] == '\x1f\x8b\x08\x00\x00\x00':
        print 'GZIP'
        text = gzip.GzipFile(fileobj=cStringIO.StringIO(text)).read()
    # a = text.decode('gb2312')
    # print a[0:20]
except httplib.IncompleteRead as e:
    text = e.partial

# soup = BeautifulSoup(text)
# text = soup.renderContents()
if not text:
    print 'no text!'

doc = readability.Document(text)
content = doc.summary(html_partial=True)
print content

# httplib.HTTPConnection._http_vsn = 10
# httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
# proxies = {'http': 'http://127.0.0.1:7777', 'https': 'http://127.0.0.1:7777'}
# r = requests.get(url, proxies=proxies)
# print r.encoding
# t = r.text
# print type(t)
# d = readability.Document(t)
# c = d.summary(html_partial=True)
# c.encode(r.encoding)
# print type(c)
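# A consolidated, self-contained sketch of the same flow the fragments above build by
# hand with urllib2/gzip: fetch the page with requests (which transparently decompresses
# gzip and picks an encoding) and run readability over it. The function name, URL, and
# timeout are placeholders for illustration, not from the source.
import requests
import readability

def extract_article(url, timeout=10):
    resp = requests.get(url, timeout=timeout)
    doc = readability.Document(resp.text, url=resp.url)
    return dict(title=doc.title(),
                content=doc.summary(html_partial=True))

# Example: extract_article("http://example.com/some-post")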