Example #1
def extract_calibre_cover(raw, base, log):
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup(raw)
    matches = soup.find(name=["h1", "h2", "h3", "h4", "h5", "h6", "p", "span", "font", "br"])
    images = soup.findAll("img")
    if matches is None and len(images) == 1 and images[0].get("alt", "") == "cover":
        img = images[0]
        img = os.path.join(base, *img["src"].split("/"))
        if os.path.exists(img):
            return open(img, "rb").read()

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find("body")
        if body is not None:
            text = u"".join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll("img", src=True)
            if 0 < len(images) < 2:
                img = os.path.join(base, *images[0]["src"].split("/"))
                if os.path.exists(img):
                    return open(img, "rb").read()
Example #2
    def search_for_asin_on_amazon(self, query):
        '''Search for the book's ASIN on Amazon using the given query'''
        query = urlencode({'keywords': query})
        url = '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[
            9:] + '&' + query
        try:
            response = open_url(self._connections['amazon'], url)
        except PageDoesNotExist:
            return None

        # check to make sure there are results
        if ('did not match any products' in response
                and 'Did you mean:' not in response
                and 'so we searched in All Departments' not in response):
            return None

        soup = BeautifulSoup(response)
        results = soup.findAll('div', {'class': 's-result-list'})

        if not results:
            return None

        for result in results:
            if 'Buy now with 1-Click' in str(result):
                asin_search = AMAZON_ASIN_PAT.search(str(result))
                if asin_search:
                    return asin_search.group(1)

        return None
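
The snippet assumes an AMAZON_ASIN_PAT regular expression defined elsewhere in the plugin; all this code requires of it is a single capture group holding the ASIN. A hedged sketch of such a pattern (an illustration, not the plugin's actual definition) might be:

import re

# Hypothetical pattern: Amazon result markup commonly carries a data-asin attribute
# whose value is the ten-character ASIN.
AMAZON_ASIN_PAT = re.compile(r'data-asin="([A-Z0-9]{10})"')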
Example #3
    def get_annotations_date_range(self):
        '''
        Find the oldest and newest annotations in annotated books.
        The initial values of self.oldest_annotation and self.newest_annotation are
        reversed to allow update comparisons; if no annotations are found, restore
        them to their correct values.
        '''
        annotations_found = False

        for cid in self.annotation_map:
            mi = self.cdb.get_metadata(cid, index_is_id=True)
            if self.field == 'Comments':
                soup = BeautifulSoup(mi.comments)
            else:
                soup = BeautifulSoup(mi.get_user_metadata(self.field, False)['#value#'])

            uas = soup.findAll('div', 'annotation')
            for ua in uas:
                annotations_found = True
                timestamp = float(ua.find('td', 'timestamp')['uts'])
                if timestamp < self.oldest_annotation:
                    self.oldest_annotation = timestamp
                if timestamp > self.newest_annotation:
                    self.newest_annotation = timestamp

        if not annotations_found:
            temp = self.newest_annotation
            self.newest_annotation = self.oldest_annotation
            self.oldest_annotation = temp
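
The "reversed" initial values the docstring refers to would be seeded roughly like this; the class name and exact seed values below are assumptions for illustration, not necessarily what the plugin uses:

import time

class AnnotatedBooksScanner(object):  # hypothetical holder for the method above
    def __init__(self):
        # Seed 'oldest' at the current time and 'newest' at the epoch, so the first
        # real timestamp encountered replaces both in the comparisons above.
        self.oldest_annotation = time.mktime(time.localtime())
        self.newest_annotation = 0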
Example #4
    def get_annotations_date_range(self):
        '''
        Find the oldest and newest annotations in annotated books.
        The initial values of self.oldest_annotation and self.newest_annotation are
        reversed to allow update comparisons; if no annotations are found, restore
        them to their correct values.
        '''
        annotations_found = False

        for cid in self.annotation_map:
            mi = self.cdb.get_metadata(cid, index_is_id=True)
            if self.field == 'Comments':
                soup = BeautifulSoup(mi.comments)
            else:
                soup = BeautifulSoup(
                    mi.get_user_metadata(self.field, False)['#value#'])

            uas = soup.findAll('div', 'annotation')
            for ua in uas:
                annotations_found = True
                timestamp = float(ua.find('td', 'timestamp')['uts'])
                if timestamp < self.oldest_annotation:
                    self.oldest_annotation = timestamp
                if timestamp > self.newest_annotation:
                    self.newest_annotation = timestamp

        if not annotations_found:
            temp = self.newest_annotation
            self.newest_annotation = self.oldest_annotation
            self.oldest_annotation = temp
Example #5
def extract_calibre_cover(raw, base, log):
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(raw)
    matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
        'font', 'br'])
    images = soup.findAll('img')
    if matches is None and len(images) == 1 and \
            images[0].get('alt', '')=='cover':
        img = images[0]
        img = os.path.join(base, *img['src'].split('/'))
        if os.path.exists(img):
            return open(img, 'rb').read()

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find('body')
        if body is not None:
            text = u''.join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll('img', src=True)
            if 0 < len(images) < 2:
                img = os.path.join(base, *images[0]['src'].split('/'))
                if os.path.exists(img):
                    return open(img, 'rb').read()
Example #6
File: simple.py Project: zyhong/calibre
    def get_soup(self, src, url=None):
        nmassage = []
        nmassage.extend(self.preprocess_regexps)
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        for pat, repl in nmassage:
            usrc = pat.sub(repl, usrc)
        soup = BeautifulSoup(usrc)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            replace = xml_to_unicode(replace,
                                     self.verbose,
                                     strip_encoding_pats=True)[0]
            for pat, repl in nmassage:
                replace = pat.sub(repl, replace)
            soup = BeautifulSoup(replace)

        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(
                self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(
                self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
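
The spec dictionaries that drive findAll(**spec) above are ordinary keyword-argument dicts. A recipe feeding this method would typically declare them along these lines (the tag names and classes are illustrative, not from any real recipe):

keep_only_tags = [dict(name='div', attrs={'class': 'article-body'})]
remove_tags = [
    dict(name='div', attrs={'class': 'advert'}),
    dict(name='script'),
]
remove_tags_after = dict(name='div', attrs={'id': 'comments'})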
Example #7
    def search_for_asin_on_amazon(self, query):
        '''Search for the book's ASIN on Amazon using the given query'''
        query = urlencode({'keywords': query})
        url = '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query
        try:
            response = open_url(self._connections['amazon'], url)
        except PageDoesNotExist:
            return None

        # check to make sure there are results
        if ('did not match any products' in response and 'Did you mean:' not in response and
                'so we searched in All Departments' not in response):
            return None

        soup = BeautifulSoup(response)
        results = soup.findAll('div', {'id': 'resultsCol'})

        if not results:
            return None

        for result in results:
            if 'Buy now with 1-Click' in str(result):
                asin_search = AMAZON_ASIN_PAT.search(str(result))
                if asin_search:
                    return asin_search.group(1)

        return None
Example #8
def extract_calibre_cover(raw, base, log):
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(raw)
    matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
        'font', 'br'])
    images = soup.findAll('img')
    if matches is None and len(images) == 1 and \
            images[0].get('alt', '')=='cover':
        img = images[0]
        img = os.path.join(base, *img['src'].split('/'))
        if os.path.exists(img):
            return open(img, 'rb').read()

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find('body')
        if body is not None:
            text = u''.join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll('img', src=True)
            if 0 < len(images) < 2:
                img = os.path.join(base, *images[0]['src'].split('/'))
                if os.path.exists(img):
                    return open(img, 'rb').read()
Example #9
    def get_soup(self, src, url=None):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        # Some websites have buggy doctype declarations that mess up beautifulsoup
        nmassage += [(re.compile(r'<!DOCTYPE .+?>',
                                 re.DOTALL | re.IGNORECASE), lambda m: '')]
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        soup = BeautifulSoup(usrc, markupMassage=nmassage)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            soup = BeautifulSoup(xml_to_unicode(replace,
                                                self.verbose,
                                                strip_encoding_pats=True)[0],
                                 markupMassage=nmassage)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(
                self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(
                self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Example #10
File: simple.py Project: JimmXinu/calibre
    def get_soup(self, src, url=None):
        nmassage = []
        nmassage.extend(self.preprocess_regexps)
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        for pat, repl in nmassage:
            usrc = pat.sub(repl, usrc)
        soup = BeautifulSoup(usrc)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
            for pat, repl in nmassage:
                replace = pat.sub(repl, replace)
            soup = BeautifulSoup(replace)

        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Example #11
File: simple.py Project: rlugojr/calibre
    def get_soup(self, src, url=None):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        # Some websites have buggy doctype declarations that mess up beautifulsoup
        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        soup = BeautifulSoup(usrc, markupMassage=nmassage)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Example #12
    def get_asin(self, connection):
        query = urlencode({'keywords': '%s - %s' % (self._title, self._author)})
        try:
            connection.request('GET', '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query, headers=self.HEADERS)
            response = connection.getresponse().read()
        except:
            try:
                connection.close()
                if self._proxy:
                    connection = HTTPConnection(self._http_address, self._http_port)
                    connection.set_tunnel('www.amazon.com', 80)
                else:
                    connection = HTTPConnection('www.amazon.com')

                connection.request('GET', '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query, headers=self.HEADERS)
                response = connection.getresponse().read()
            except:
                self._status = self.FAIL
                self._status_message = self.FAILED_COULD_NOT_CONNECT_TO_AMAZON
                raise Exception(self._status_message)

        # check to make sure there are results
        if ('did not match any products' in response and 'Did you mean:' not in response and
                'so we searched in All Departments' not in response):
            self._status = self.FAIL
            self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_PAGE
            raise Exception(self._status_message)

        soup = BeautifulSoup(response)
        results = soup.findAll('div', {'id': 'resultsCol'})
       
        if not results:
            self._status = self.FAIL
            self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_PAGE
            raise Exception(self._status_message)

        for r in results:
            if 'Buy now with 1-Click' in str(r):
                asinSearch = self.AMAZON_ASIN_PAT.search(str(r))
                if asinSearch:
                    self._asin = asinSearch.group(1)
                    mi = self._db.get_metadata(self._book_id)
                    identifiers = mi.get_identifiers()
                    identifiers['mobi-asin'] = self._asin
                    mi.set_identifiers(identifiers)
                    self._db.set_metadata(self._book_id, mi)
                    self._book_settings.prefs['asin'] = self._asin
                    return connection

        self._status = self.FAIL
        self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_ASIN
        raise Exception(self._status_message)
Example #13
def merge_annotations_with_comments(parent, cid, comments_soup, new_soup):
    '''
    comments_soup: comments potentially with user_annotations
    '''

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        plugin_prefs.get(
            'COMMENTS_DIVIDER',
            '&middot;  &middot;  &bull;  &middot;  &#x2726;  &middot;  &bull;  &middot; &middot;'
        ))

    # Remove the old comments_divider
    cds = comments_soup.find('div', 'comments_divider')
    if cds:
        cds.extract()

    # Existing annotations?
    uas = comments_soup.find('div', 'user_annotations')
    if uas:
        # Save the existing annotations to old_soup
        old_soup = BeautifulSoup(unicode(uas))

        # Remove any hrs from old_soup
        hrs = old_soup.findAll('hr')
        if hrs:
            for hr in hrs:
                hr.extract()

        # Remove the existing annotations from comments_soup
        uas.extract()

        # Merge old_soup with new_soup
        merged_soup = unicode(comments_soup) + \
                      unicode(comments_divider) + \
                      unicode(merge_annotations(parent, cid, old_soup, new_soup))
    else:
        # No existing, just merge comments_soup with already sorted new_soup
        merged_soup = unicode(comments_soup) + \
                      unicode(comments_divider) + \
                      unicode(new_soup)

    return merged_soup
Example #14
def merge_annotations_with_comments(parent, cid, comments_soup, new_soup):
    '''
    comments_soup: comments potentially with user_annotations
    '''

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        plugin_prefs.get('COMMENTS_DIVIDER', '&middot;  &middot;  &bull;  &middot;  &#x2726;  &middot;  &bull;  &middot; &middot;'))

    # Remove the old comments_divider
    cds = comments_soup.find('div', 'comments_divider')
    if cds:
        cds.extract()

    # Existing annotations?
    uas = comments_soup.find('div', 'user_annotations')
    if uas:
        # Save the existing annotations to old_soup
        old_soup = BeautifulSoup(unicode(uas))

        # Remove any hrs from old_soup
        hrs = old_soup.findAll('hr')
        if hrs:
            for hr in hrs:
                hr.extract()

        # Remove the existing annotations from comments_soup
        uas.extract()

        # Merge old_soup with new_soup
        merged_soup = unicode(comments_soup) + \
                      unicode(comments_divider) + \
                      unicode(merge_annotations(parent, cid, old_soup, new_soup))
    else:
        # No existing, just merge comments_soup with already sorted new_soup
        merged_soup = unicode(comments_soup) + \
                      unicode(comments_divider) + \
                      unicode(new_soup)

    return merged_soup
Example #15
    def get_asin(self):
        query = urlencode({'keywords': '%s' % self.title_and_author})
        try:
            self._aConnection.request('GET', '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query, headers=self.HEADERS)
            response = self._aConnection.getresponse().read()
        except:
            try:
                self._aConnection.close()
                if self._proxy:
                    self._aConnection = HTTPConnection(self._http_address, self._http_port)
                    self._aConnection.set_tunnel('www.amazon.com', 80)
                else:
                    self._aConnection = HTTPConnection('www.amazon.com')

                self._aConnection.request('GET', '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query, headers=self.HEADERS)
                response = self._aConnection.getresponse().read()
            except:
                return None

        # check to make sure there are results
        if ('did not match any products' in response and 'Did you mean:' not in response and
                'so we searched in All Departments' not in response):
            return None

        soup = BeautifulSoup(response)
        results = soup.findAll('div', {'id': 'resultsCol'})
       
        if not results:
            return None

        for r in results:
            if 'Buy now with 1-Click' in str(r):
                asinSearch = self.AMAZON_ASIN_PAT.search(str(r))
                if asinSearch:
                    asin = asinSearch.group(1)
                    mi = self._db.get_metadata(self._book_id)
                    identifiers = mi.get_identifiers()
                    identifiers['mobi-asin'] = asin
                    mi.set_identifiers(identifiers)
                    self._db.set_metadata(self._book_id, mi)
                    return asin
Example #16
    def read_html_toc(self, toc):
        self.base_path = os.path.dirname(toc)
        soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
        for a in soup.findAll('a'):
            if not a.has_key('href'):
                continue
            purl = urlparse(unquote(a['href']))
            href, fragment = purl[2], purl[5]  # path and fragment components of the parsed href
            if not fragment:
                fragment = None
            else:
                fragment = fragment.strip()
            href = href.strip()

            txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
            add = True
            for i in self.flat():
                if i.href == href and i.fragment == fragment:
                    add = False
                    break
            if add:
                self.add_item(href, fragment, txt)
Example #17
File: toc.py Project: JapaChin/calibre
    def read_html_toc(self, toc):
        self.base_path = os.path.dirname(toc)
        soup = BeautifulSoup(open(toc, "rb").read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
        for a in soup.findAll("a"):
            if not a.has_key("href"):
                continue
            purl = urlparse(unquote(a["href"]))
            href, fragment = purl[2], purl[5]  # path and fragment components of the parsed href
            if not fragment:
                fragment = None
            else:
                fragment = fragment.strip()
            href = href.strip()

            txt = "".join([unicode(s).strip() for s in a.findAll(text=True)])
            add = True
            for i in self.flat():
                if i.href == href and i.fragment == fragment:
                    add = False
                    break
            if add:
                self.add_item(href, fragment, txt)
Example #18
    def update_results(self, trigger):
        #self._log_location(trigger)
        reader_to_match = str(self.find_annotations_reader_comboBox.currentText())
        color_to_match = str(self.find_annotations_color_comboBox.currentText())
        text_to_match = str(self.find_annotations_text_lineEdit.text())
        note_to_match = str(self.find_annotations_note_lineEdit.text())

        from_date = self.find_annotations_date_from_dateEdit.dateTime().toTime_t()
        to_date = self.find_annotations_date_to_dateEdit.dateTime().toTime_t()

        annotation_map = self.annotated_books_scanner.annotation_map
        #field = self.prefs.get("cfg_annotations_destination_field", None)
        field = get_cc_mapping('annotations', 'field', None)

        db = self.opts.gui.current_db
        matched_titles = []
        self.matched_ids = set()

        for cid in annotation_map:
            mi = db.get_metadata(cid, index_is_id=True)
            soup = None
            if field == 'Comments':
                if mi.comments:
                    soup = BeautifulSoup(mi.comments)
            else:
                if mi.get_user_metadata(field, False)['#value#'] is not None:
                    soup = BeautifulSoup(mi.get_user_metadata(field, False)['#value#'])
            if soup:
                uas = soup.findAll('div', 'annotation')
                for ua in uas:
                    # Have we already matched this book?
                    if cid in self.matched_ids:
                        continue

                    # Check reader
                    if reader_to_match != self.GENERIC_READER:
                        this_reader = ua['reader']
                        if this_reader != reader_to_match:
                            continue

                    # Check color
                    if color_to_match != self.GENERIC_STYLE:
                        this_color = ua.find('table')['color']
                        if this_color != color_to_match:
                            continue

                    # Check date range, allow for mangled timestamp
                    try:
                        timestamp = float(ua.find('td', 'timestamp')['uts'])
                        if timestamp < from_date or timestamp > to_date:
                            continue
                    except:
                        continue

                    highlight_text = ''
                    try:
                        pels = ua.findAll('p', 'highlight')
                        highlight_text = '\n'.join([p.string for p in pels])
                    except:
                        pass
                    if text_to_match > '':
                        if not re.search(text_to_match, highlight_text, flags=re.IGNORECASE):
                            continue

                    note_text = ''
                    try:
                        nels = ua.findAll('p', 'note')
                        note_text = '\n'.join([n.string for n in nels])
                    except:
                        pass
                    if note_to_match > '':
                        if not re.search(note_to_match, note_text, flags=re.IGNORECASE):
                            continue

                    # If we made it this far, add the id to matched_ids
                    self.matched_ids.add(cid)
                    matched_titles.append(mi.title)

        # Update the results box
        matched_titles.sort()
        if len(annotation_map):
            if len(matched_titles):
                first_match = ("<i>%s</i>" % matched_titles[0])
                if len(matched_titles) == 1:
                    results = first_match
                else:
                    results = first_match + (_(" and {0} more.").format(len(matched_titles) - 1))
                self.result_label.setText('<p style="color:blue">{0}</p>'.format(results))
            else:
                self.result_label.setText('<p style="color:red">{0}</p>'.format(_('no matches')))
        else:
            self.result_label.setText('<p style="color:red">{0}</p>'.format(_('no annotated books in library')))

        self.resize_dialog()
Example #19
    def update_results(self, trigger):
        #self._log_location(trigger)
        reader_to_match = str(self.find_annotations_reader_comboBox.currentText())
        color_to_match = str(self.find_annotations_color_comboBox.currentText())
        text_to_match = str(self.find_annotations_text_lineEdit.text())
        note_to_match = str(self.find_annotations_note_lineEdit.text())

        from_date = self.find_annotations_date_from_dateEdit.dateTime().toTime_t()
        to_date = self.find_annotations_date_to_dateEdit.dateTime().toTime_t()

        annotation_map = self.annotated_books_scanner.annotation_map
        #field = self.prefs.get("cfg_annotations_destination_field", None)
        field = get_cc_mapping('annotations', 'field', None)

        db = self.opts.gui.current_db
        matched_titles = []
        self.matched_ids = set()

        for cid in annotation_map:
            mi = db.get_metadata(cid, index_is_id=True)
            soup = None
            if field == 'Comments':
                if mi.comments:
                    soup = BeautifulSoup(mi.comments)
            else:
                if mi.get_user_metadata(field, False)['#value#'] is not None:
                    soup = BeautifulSoup(mi.get_user_metadata(field, False)['#value#'])
            if soup:
                uas = soup.findAll('div', 'annotation')
                for ua in uas:
                    # Have we already matched this book?
                    if cid in self.matched_ids:
                        continue

                    # Check reader
                    if reader_to_match != self.GENERIC_READER:
                        this_reader = ua['reader']
                        if this_reader != reader_to_match:
                            continue

                    # Check color
                    if color_to_match != self.GENERIC_STYLE:
                        this_color = ua.find('table')['color']
                        if this_color != color_to_match:
                            continue

                    # Check date range, allow for mangled timestamp
                    try:
                        timestamp = float(ua.find('td', 'timestamp')['uts'])
                        if timestamp < from_date or timestamp > to_date:
                            continue
                    except:
                        continue

                    highlight_text = ''
                    try:
                        pels = ua.findAll('p', 'highlight')
                        for pel in pels:
                            highlight_text += pel.string + '\n'
                    except:
                        pass
                    if text_to_match > '':
                        if not re.search(text_to_match, highlight_text, flags=re.IGNORECASE):
                            continue

                    note_text = ''
                    try:
                        nels = ua.findAll('p', 'note')
                        for nel in nels:
                            note_text += nel.string + '\n'
                    except:
                        pass
                    if note_to_match > '':
                        if not re.search(note_to_match, note_text, flags=re.IGNORECASE):
                            continue

                    # If we made it this far, add the id to matched_ids
                    self.matched_ids.add(cid)
                    matched_titles.append(mi.title)

        # Update the results box
        matched_titles.sort()
        if len(annotation_map):
            if len(matched_titles):
                first_match = ("<i>%s</i>" % matched_titles[0])
                if len(matched_titles) == 1:
                    results = first_match
                else:
                    results = first_match + (" and %d more." % (len(matched_titles) - 1))
                self.result_label.setText('<p style="color:blue">{0}</p>'.format(results))
            else:
                self.result_label.setText('<p style="color:red">no matches</p>')
        else:
            self.result_label.setText('<p style="color:red">no annotated books in library</p>')

        self.resize_dialog()
Example #20
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p>' returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
                for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
        '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(lost_cr.group(),
                                    '%s%s\n\n%s' % (lost_cr.group(1),
                                                    lost_cr.group(2),
                                                    lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '&mdash;')

    soup = BeautifulSoup(comments)
    result = BeautifulSoup()
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if type(token) is NavigableString:
            if not open_pTag:
                pTag = Tag(result,'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc,prepare_string_for_xml(token))
            ptc += 1
        elif type(token) in (CData, Comment, Declaration,
                ProcessingInstruction):
            continue
        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a',
                'hr']:
            if not open_pTag:
                pTag = Tag(result,'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                result.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            result.insert(rtc, token)
            rtc += 1

    if open_pTag:
        result.insert(rtc, pTag)

    for p in result.findAll('p'):
        p['class'] = 'description'

    for t in result.findAll(text=True):
        t.replaceWith(prepare_string_for_xml(unicode(t)))

    return result.renderContents(encoding=None)
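
A quick usage sketch matching the docstring's plain-text case (Python 2, as the code above assumes; the class="description" attribute comes from the plain-text branch):

print(comments_to_html('A line of text\n\nFollowed by a line of text'))
# <p class="description">A line of text</p>
# <p class="description">Followed by a line of text</p>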
Example #21
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p>' returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
                for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
        '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(lost_cr.group(),
                                    '%s%s\n\n%s' % (lost_cr.group(1),
                                                    lost_cr.group(2),
                                                    lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '&mdash;')

    soup = BeautifulSoup(comments)
    result = BeautifulSoup()
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if type(token) is NavigableString:
            if not open_pTag:
                pTag = Tag(result,'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc,prepare_string_for_xml(token))
            ptc += 1
        elif type(token) in (CData, Comment, Declaration,
                ProcessingInstruction):
            continue
        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a',
                'hr']:
            if not open_pTag:
                pTag = Tag(result,'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                result.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            result.insert(rtc, token)
            rtc += 1

    if open_pTag:
        result.insert(rtc, pTag)

    for p in result.findAll('p'):
        p['class'] = 'description'

    for t in result.findAll(text=True):
        t.replaceWith(prepare_string_for_xml(unicode(t)))

    return result.renderContents(encoding=None)
Example #22
File: comments.py Project: GRiker/calibre
def comments_to_html(comments):
    """
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p>' returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    """
    if not comments:
        return u"<p></p>"
    if not isinstance(comments, unicode):
        comments = comments.decode(preferred_encoding, "replace")

    if comments.lstrip().startswith("<"):
        # Comment is already HTML do not mess with it
        return comments

    if "<" not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>' % x.replace(u"\n", u"<br />") for x in comments.split("\n\n")]
        return "\n".join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback

            traceback.print_exc()
            return u"<p></p>"

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace(".", ".\r"), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(
            lost_cr.group(), "%s%s\n\n%s" % (lost_cr.group(1), lost_cr.group(2), lost_cr.group(3))
        )

    comments = comments.replace(u"\r", u"")
    # Convert \n\n to <p>s
    comments = comments.replace(u"\n\n", u"<p>")
    # Convert solo returns to <br />
    comments = comments.replace(u"\n", "<br />")
    # Convert two hyphens to emdash
    comments = comments.replace("--", "&mdash;")

    soup = BeautifulSoup(comments)
    result = BeautifulSoup()
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if type(token) is NavigableString:
            if not open_pTag:
                pTag = Tag(result, "p")
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, prepare_string_for_xml(token))
            ptc += 1
        elif type(token) in (CData, Comment, Declaration, ProcessingInstruction):
            continue
        elif token.name in ["br", "b", "i", "em", "strong", "span", "font", "a", "hr"]:
            if not open_pTag:
                pTag = Tag(result, "p")
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                result.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            result.insert(rtc, token)
            rtc += 1

    if open_pTag:
        result.insert(rtc, pTag)

    for p in result.findAll("p"):
        p["class"] = "description"

    for t in result.findAll(text=True):
        t.replaceWith(prepare_string_for_xml(unicode(t)))

    return result.renderContents(encoding=None)