def fetch(self, skip_save=False):
    try:
        html = requests.get(self.story.story_permalink, headers=self.headers)
        original_text_doc = readability.Document(html.text, url=html.url, debug=settings.DEBUG)
        content = original_text_doc.summary(html_partial=True)
    except Exception:
        content = None

    if content:
        if not skip_save:
            self.story.original_text_z = zlib.compress(content)
            self.story.save()
        logging.user(self.request, "~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
            len(unicode(content)),
            self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))))
    else:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
            len(zlib.decompress(self.story.story_content_z))))

    return content
def fetch(self, skip_save=False, return_document=False):
    if self.story_url and any(broken_url in self.story_url for broken_url in BROKEN_URLS):
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned")
        return

    try:
        resp = self.fetch_request()
    except TimeoutError:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
        resp = None
    except requests.exceptions.TooManyRedirects:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
        resp = None

    if not resp:
        return

    try:
        text = resp.text
    except (LookupError, TypeError):
        text = resp.content

    # if self.debug:
    #     logging.user(self.request, "~FBOriginal text's website: %s" % text)

    if resp.encoding and resp.encoding != 'utf-8':
        try:
            text = text.encode(resp.encoding)
        except (LookupError, UnicodeEncodeError):
            pass

    if text:
        # Non-breaking spaces get mangled when the encoding is not utf-8
        text = text.replace("\xc2\xa0", " ")
        text = text.replace("\u00a0", " ")

    original_text_doc = readability.Document(
        text, url=resp.url,
        positive_keywords="post, entry, postProp, article, postContent, postField")
    try:
        content = original_text_doc.summary(html_partial=True)
    except (readability.Unparseable, ParserError) as e:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
        return
def fetch(self, skip_save=False, return_document=False):
    try:
        resp = self.fetch_request()
    except TimeoutError:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
        resp = None
    except requests.exceptions.TooManyRedirects:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
        resp = None

    if not resp:
        return

    try:
        text = resp.text
    except (LookupError, TypeError):
        text = resp.content

    # Only re-encode when the server did not declare a charset explicitly
    charset_declared = 'charset' in resp.headers.get('content-type', "")
    if resp.encoding and resp.encoding != 'utf-8' and not charset_declared:
        try:
            text = text.encode(resp.encoding)
        except (LookupError, UnicodeEncodeError):
            pass

    original_text_doc = readability.Document(text, url=resp.url, debug=settings.DEBUG)
    try:
        content = original_text_doc.summary(html_partial=True)
    except readability.Unparseable:
        return

    try:
        title = original_text_doc.title()
    except TypeError:
        title = ""

    url = resp.url

    if content:
        if self.story and not skip_save:
            self.story.original_text_z = zlib.compress(content)
            try:
                self.story.save()
            except NotUniqueError:
                pass
        logging.user(self.request, (
            "~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
                len(unicode(content)),
                self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
            )), warn_color=False)
    else:
        logging.user(self.request, (
            "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
                self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
            )), warn_color=False)

    if return_document:
        return dict(content=content, title=title, url=url, doc=original_text_doc)

    return content
def fetch(self, skip_save=False, return_document=False):
    if self.story_url and any(broken_url in self.story_url for broken_url in BROKEN_URLS):
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned")
        return

    try:
        resp = self.fetch_request()
    except TimeoutError:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
        resp = None
    except requests.exceptions.TooManyRedirects:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
        resp = None

    if not resp:
        return

    text = resp.text
    original_text_doc = readability.Document(text, url=resp.url, debug=self.debug,
                                             positive_keywords=["postContent", "postField"])
    try:
        content = original_text_doc.summary(html_partial=True)
    except readability.Unparseable:
        return

    try:
        title = original_text_doc.title()
    except TypeError:
        title = ""

    url = resp.url

    if content:
        if self.story and not skip_save:
            self.story.original_text_z = zlib.compress(smart_str(content))
            try:
                self.story.save()
            except NotUniqueError:
                pass
        logging.user(self.request, (
            "~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
                len(content),
                self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
            )), warn_color=False)
    else:
        logging.user(self.request, (
            "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
                self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
            )), warn_color=False)

    if return_document:
        return dict(content=content, title=title, url=url, doc=original_text_doc)

    return content
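# The variants above all delegate the HTTP call to self.fetch_request(), which is not
# shown in this section. A minimal sketch of what such a helper might look like,
# assuming it wraps requests.get with the instance's headers and a fixed timeout.
# The timeout value, the header attribute, and the re-raise as TimeoutError are
# assumptions for illustration, not taken from the source.
def fetch_request(self):
    try:
        return requests.get(self.story_url, headers=self.headers, timeout=10)
    except requests.exceptions.Timeout:
        # Normalize to the TimeoutError that the callers above catch
        # (assumption about how that exception reaches them).
        raise TimeoutError("timed out fetching %s" % self.story_url)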
def fetch(self, skip_save=False):
    try:
        resp = self.fetch_request()
    except TimeoutError:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
        resp = None

    if not resp:
        return

    try:
        text = resp.text
    except (LookupError, TypeError):
        text = resp.content

    if resp.encoding and resp.encoding != 'utf-8':
        try:
            text = text.encode(resp.encoding)
        except (LookupError, UnicodeEncodeError):
            pass

    original_text_doc = readability.Document(text, url=resp.url, debug=settings.DEBUG)
    try:
        content = original_text_doc.summary(html_partial=True)
    except readability.Unparseable:
        return

    if content:
        if not skip_save:
            self.story.original_text_z = zlib.compress(content)
            try:
                self.story.save()
            except NotUniqueError:
                pass
        logging.user(self.request, (
            "~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
                len(unicode(content)),
                self.story.story_content_z and len(zlib.decompress(self.story.story_content_z)))),
            warn_color=False)
    else:
        logging.user(self.request, (
            "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
                self.story.story_content_z and len(zlib.decompress(self.story.story_content_z)))),
            warn_color=False)

    return content
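# Several variants store the extracted HTML compressed on the story
# (story.original_text_z) and log its size against the decompressed
# story_content_z. A standalone sketch of that zlib round-trip; the sample
# string is made up. Note that zlib.compress() needs bytes, which is why one
# variant wraps the content in smart_str() before compressing.
import zlib

content = "<div><p>Extracted article body...</p></div>"
original_text_z = zlib.compress(content.encode('utf-8'))      # what gets saved on the story
restored = zlib.decompress(original_text_z).decode('utf-8')   # what the size logging reads back
assert restored == content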
def fetch(self, skip_save=False, return_document=False):
    if self.story_url and any(broken_url in self.story_url for broken_url in BROKEN_URLS):
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned")
        return

    try:
        resp = self.fetch_request()
    except TimeoutError:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
        resp = None
    except requests.exceptions.TooManyRedirects:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
        resp = None

    if not resp:
        return

    text = resp.text
    original_text_doc = readability.Document(text, url=resp.url, debug=self.debug,
                                             positive_keywords=["postContent", "postField"])
    try:
        content = original_text_doc.summary(html_partial=True)
    except (readability.Unparseable, ParserError) as e:
        logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
        return
self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e) logging.error('error fetch_request'+str(e)+\ # ' feed_id:'+str(self.story.story_feed_id)+\ ' stroy_link:'+str(self.story.story_permalink)) return finally: opener.close() if not text: logging.error('error fetch text: text is null') return #soup = BeautifulSoup(text) #text = soup.renderContents() try: original_text_doc = readability.Document( text, url=self.story.story_permalink) content = original_text_doc.summary(html_partial=True) print "the length of content: %s" % len(content) #content = content.encode("utf-8") except readability.Unparseable, e: logging.error('error getting summary: '+str(e)+\ # ' feed_id:'+str(self.story.story_feed_id)+\ ' stroy_link:'+str(self.story.story_permalink)) # if settings.SEND_ERROR_MAILS: # mail_admins("Error in text_importer Build Document",str(e)+\ # ' feed_id:'+str(self.story.story_feed_id)+\ # ' stroy_link:'+str(self.story.story_permalink)+\ # traceback.format_exc()) return if len(content) < 60:
    text = opener.open(request).read()
    print 'hi'
    print len(text)
    # Detect gzip magic bytes and decompress by hand, since urllib2 does not
    if text[:6] == '\x1f\x8b\x08\x00\x00\x00':
        print 'GZIP'
        text = gzip.GzipFile(fileobj=cStringIO.StringIO(text)).read()
    # a = text.decode('gb2312')
    # print a[0:20]
except httplib.IncompleteRead as e:
    text = e.partial

# soup = BeautifulSoup(text)
# text = soup.renderContents()
if not text:
    print 'no text!'

doc = readability.Document(text)
content = doc.summary(html_partial=True)
print content

# httplib.HTTPConnection._http_vsn = 10
# httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
# proxies = {'http': 'http://127.0.0.1:7777', 'https': 'http://127.0.0.1:7777'}
# r = requests.get(url, proxies=proxies)
# print r.encoding
# t = r.text
# print type(t)
# d = readability.Document(t)
# c = d.summary(html_partial=True)
# c.encode(r.encoding)
# print type(c)
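# A consolidated, self-contained sketch of the same flow the fragments above build by
# hand with urllib2/gzip: fetch the page with requests (which transparently decompresses
# gzip and picks an encoding) and run readability over it. The function name, URL, and
# timeout are placeholders for illustration, not from the source.
import requests
import readability

def extract_article(url, timeout=10):
    resp = requests.get(url, timeout=timeout)
    doc = readability.Document(resp.text, url=resp.url)
    return dict(title=doc.title(),
                content=doc.summary(html_partial=True))

# Example: extract_article("http://example.com/some-post")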