from cgi import escape

import markdown
from bs4 import BeautifulSoup
from pygments import formatters, highlight, lexers


def pygments_markdown(content):
    # Flatten the per-lexer alias tuples into one tuple of known lexer names.
    _lexer_names = reduce(lambda a, b: a + b[2], lexers.LEXERS.itervalues(), ())
    _formatter = formatters.HtmlFormatter(cssclass='highlight')

    html = markdown.markdown(content)
    # Using html.parser to prevent bs4 adding an <html> tag
    soup = BeautifulSoup(html, 'html.parser')

    # Refuse to process content that already contains structural HTML tags.
    for tag in ("script", "html", "head", "title", "div", "hr",
                "article", "header", "footer"):
        if soup.findAll(tag):
            return escape(content)

    for pre in soup.findAll('pre'):
        if not pre.code:
            continue
        txt = unicode(pre.code.text)

        # A code block may start with a ':::lexer' hint, e.g. ':::python'.
        lexer_name = "text"
        if txt.startswith(':::'):
            lexer_name, txt = txt.split('\n', 1)
            lexer_name = lexer_name.split(':::')[1]
            if lexer_name not in _lexer_names:
                lexer_name = "text"
        lexer = lexers.get_lexer_by_name(lexer_name, stripnl=True, encoding='UTF-8')

        # Markdown escaped the code; undo the HTML entities before highlighting.
        if txt.find("&lt;") != -1 or txt.find("&gt;") != -1:
            txt = txt.replace("&lt;", "<").replace("&gt;", ">")
        if txt.find("&amp;") != -1:
            txt = txt.replace("&amp;", "&")

        highlighted = highlight(txt, lexer, _formatter)
        div_code = BeautifulSoup(highlighted, 'html.parser').div
        if not div_code:
            return content
        pre.replaceWith(div_code)

    return unicode(soup)
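A quick usage sketch (the input string here is a made-up example): indented Markdown code blocks that begin with a ':::lexer' hint get syntax-highlighted, everything else passes through as ordinary Markdown.

content = "Some code:\n\n    :::python\n    print 'hello'\n"
print pygments_markdown(content)  # emits HTML containing a <div class="highlight"> block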
    # Requires: os, re, urllib, urlparse, BeautifulSoup, and the App Engine
    # urlfetch and template modules imported at module level.
    def get(self):
        shareURL = urllib.unquote(self.request.get('u'))
        shareSelection = urllib.unquote(self.request.get('s'))

        # Secure pages can't be handled by the bookmarklet, so bail out early.
        if shareURL.startswith('https'):
            path = os.path.join(os.path.dirname(__file__), '../views/bookmarklet-exit.html')
            self.response.out.write(template.render(path, {'message': 'Oops! This is a secure page :('}))
            return

        shareURLParts = urlparse.urlparse(shareURL)
        if shareURLParts[2] != '':
            shareURLDir = re.search('(/.*)', shareURLParts[2]).group(0)
        else:
            shareURLDir = ''

        page = urlfetch.fetch(shareURL)
        pageSoup = BeautifulSoup(page.content)

        # Prefer the page's own <title>; fall back to the title the bookmarklet passed in.
        try:
            shareTitle = pageSoup.html.head.title.string
        except AttributeError:
            shareTitle = urllib.unquote(self.request.get('t'))

        # Rewrite relative image URLs so they still resolve when rendered on our page.
        pageImgs = pageSoup.findAll('img')
        for image in pageImgs:
            image['src'] = urlparse.urljoin(shareURL, image['src'])

        template_values = {
            'url': shareURL,
            'title': shareTitle,
            'selection': shareSelection,
            'images': pageImgs
        }

        # We get the template path then show it
        path = os.path.join(os.path.dirname(__file__), '../views/bookmarklet.html')
        self.response.out.write(template.render(path, template_values))
import re
import sys
import urllib2
from urlparse import urljoin

from bs4 import BeautifulSoup

if len(sys.argv) != 2:
    print "You should pass the input filename"
    sys.exit()

for url in file(sys.argv[1]):
    url = url.strip()
    # Skip blank lines and comments.
    if not url or url[0] == '#':
        continue
    #print "Processing: " + url
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html)
    # Look for an autodiscovery <link> advertising an Atom or RSS feed.
    res = soup.findAll('link', rel='alternate',
                       attrs={'type': re.compile(r"^application/(atom|rss)\+xml")})
    if len(res) == 0:
        #print "Couldn't find the feed!"
        continue
    href = res[0]['href']
    # Relative link? Resolve it against the page URL.
    if not href.startswith("http"):
        link = urljoin(url, href)
    else:
        link = href
    print link