class Reader(object):
    """docstring for KindleReader"""
    output_dir = None
    config = None
    remove_tags = ['script', 'object', 'video', 'embed', 'iframe', 'noscript', 'style']
    remove_attributes = ['class', 'id', 'title', 'style', 'width', 'height', 'onclick']
    max_image_number = 0
    user_agent = "kindlereader"

    def __init__(self, output_dir, **config):
        self.output_dir = output_dir
        self.config = config
        username = self.config['username']
        password = self.config['password']

        # Prompt for the password if it is not supplied in the config.
        if not password:
            password = getpass.getpass(
                "please input your google reader's password:")

        if not username or not password:
            raise Exception("reader's username or password is empty!")

        auth = ClientAuthMethod(username, password)
        self.reader = RssReader(auth)
        self.user_info = self.reader.getUserInfo()

    def is_url_blocked(self, url):
        """Return True for ad/redirect hosts whose images should be skipped."""
        if (url.find("feedsportal.com") >= 0
                or url.find("feedsky.com") >= 0
                or url.startswith("http://union.vancl.com/")
                or url.startswith("http://www.inoreader.com/adv")):
            return True
        else:
            return False

    def parse_content(self, content, ref):
        """Process an article: strip unwanted markup and collect its images."""
        soup = BeautifulSoup(content)

        # Drop elements hidden via inline style.
        for span in list(soup.findAll(attrs={"style": "display: none;"})):
            span.extract()

        # Strip presentational and interactive attributes.
        for attr in self.remove_attributes:
            for x in soup.findAll(attrs={attr: True}):
                del x[attr]

        # Remove script/media tags entirely.
        for tag in soup.findAll(self.remove_tags):
            tag.extract()

        img_count = 0
        images = []
        for img in list(soup.findAll('img')):
            if ((self.max_image_number >= 0
                    and img_count >= self.max_image_number)
                    or not img.has_key('src')
                    or self.is_url_blocked(img['src'])):
                img.extract()
            elif len(img['src']) > 2048:
                logging.warning("img src is too long")
                img.extract()
            else:
                try:
                    output_dir = self.output_dir
                    localimage, fullname = ImageDownloadManager.parse_image(
                        img['src'], ref, output_dir)
                    # Queue the image for download if it is not already on disk.
                    if not os.path.isfile(fullname):
                        images.append({
                            'url': img['src'],
                            'filename': fullname
                        })
                    if localimage:
                        img['src'] = localimage
                        img_count = img_count + 1
                    else:
                        img.extract()
                except Exception as e:
                    logging.info("error: %s" % e)
                    img.extract()

        return soup.renderContents('utf-8'), images
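# Usage sketch (illustrative, not part of the original module). It assumes the
# module-level imports normally found at the top of this file are available:
# getpass, logging, os, BeautifulSoup, ClientAuthMethod, RssReader and
# ImageDownloadManager. The output directory, account values and sample HTML
# below are hypothetical; the config keys mirror the ones read in __init__.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    config = {
        'username': 'user@example.com',  # hypothetical account
        'password': '',                  # empty -> prompted via getpass
    }
    reader = Reader('/tmp/kindlereader-output', **config)

    # Clean a single article body and collect any images it references.
    html, images = reader.parse_content(
        '<p>hello <img src="http://example.com/a.png"/></p>',
        'http://example.com/article')
    print(html)
    print(images)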