Example #1
import os
import getpass
import logging

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3; the code is Python 2

# ClientAuthMethod, RssReader and ImageDownloadManager come from the
# surrounding kindlereader project; their imports are omitted in the source.


class Reader(object):
    """Reads articles from Google Reader and cleans them up for a Kindle."""
    output_dir = None
    config = None
    remove_tags = ['script', 'object', 'video',
                   'embed', 'iframe', 'noscript', 'style']
    remove_attributes = ['class', 'id', 'title',
                         'style', 'width', 'height', 'onclick']
    max_image_number = 0  # 0 strips every image; a negative value disables the limit
    user_agent = "kindlereader"

    def __init__(self, output_dir, **config):
        self.output_dir = output_dir

        self.config = config

        username = self.config['username']
        password = self.config['password']

        if not password:
            password = getpass.getpass(
                "please input your google reader's password:")

        if not username or not password:
            raise Exception("reader's username or password is empty!")

        auth = ClientAuthMethod(username, password)
        self.reader = RssReader(auth)
        self.user_info = self.reader.getUserInfo()

    def is_url_blocked(self, url):
        """Return True for ad/redirect hosts whose content should be dropped."""
        return (url.find("feedsportal.com") >= 0 or
                url.find("feedsky.com") >= 0 or
                url.startswith("http://union.vancl.com/") or
                url.startswith("http://www.inoreader.com/adv"))

    def parse_content(self, content, ref):
        """Process an article: clean its HTML and collect images to download."""

        soup = BeautifulSoup(content)

        # Drop elements hidden with inline CSS.
        for span in list(soup.findAll(attrs={"style": "display: none;"})):
            span.extract()

        # Strip presentational and interactive attributes.
        for attr in self.remove_attributes:
            for x in soup.findAll(attrs={attr: True}):
                del x[attr]

        # Remove tags a Kindle cannot render.
        for tag in soup.findAll(self.remove_tags):
            tag.extract()

        img_count = 0
        images = []
        for img in list(soup.findAll('img')):
            # Drop the image if the per-article limit is reached, it has no
            # src, or it comes from a blocked host.
            if ((self.max_image_number >= 0 and
                    img_count >= self.max_image_number) or
                    not img.has_key('src') or
                    self.is_url_blocked(img['src'])):
                img.extract()
            elif len(img['src']) > 2048:
                logging.warning("img src is too long")
                img.extract()
            else:
                try:
                    localimage, fullname = ImageDownloadManager.parse_image(
                        img['src'], ref, self.output_dir)
                    # Queue the image for download if it is not on disk yet.
                    if not os.path.isfile(fullname):
                        images.append({
                            'url': img['src'],
                            'filename': fullname,
                        })

                    if localimage:
                        img['src'] = localimage
                        img_count += 1
                    else:
                        img.extract()
                except Exception as e:
                    logging.info("error: %s" % e)
                    img.extract()

        return soup.renderContents('utf-8'), images
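
A minimal usage sketch, not part of the original example: it assumes the project's ClientAuthMethod and RssReader can still authenticate, and the credentials, output directory, and article HTML below are hypothetical placeholders.

Reader.max_image_number = 5  # the class default of 0 would strip every image
reader = Reader('/tmp/kindle-out',
                username='user@example.com',  # placeholder credentials
                password='secret')

html = "<p>hello</p><img src='http://example.com/a.png'/>"
content, images = reader.parse_content(html, 'http://example.com/post')
# content is the cleaned UTF-8 HTML; images lists files still to be downloaded

Note that max_image_number is read from the class attribute, not from the **config kwargs, so it must be set on the class (or instance) directly as above.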