Esempi in Python per BeautifulSoup.query

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: BeautifulSoup

Classe/tipologia: BeautifulSoup

Metodo/funzione: query

Esempi su hotexamples.com: 1

BeautifulSoup.query in Python: 1 esempio trovato. Questo è il miglior esempio reale in Python per BeautifulSoup.BeautifulSoup.query, estratto da progetti open source. Lo puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra Nascondi

BeautifulSoup(30)

decompose(30)

first(30)

find_all(30)

findAll(30)

find(30)

fetch(30)

feed(30)

getText(29)

insert(20)

findChildren(19)

body(12)

close(11)

__str__(11)

encode(8)

new_tag(6)

findChild(5)

append(4)

prettify(4)

findSelect(4)

decode(4)

get(4)

__unicode__(3)

goahead(3)

lower(3)

div(3)

findall(3)

pretify(3)

__init__(3)

firstText(2)

pop(2)

data(2)

findNext(2)

read(2)

index(1)

html(1)

query(1)

json(1)

load(1)

re_left(1)

noscript(1)

orig_url(1)

partition(1)

popTag(1)

pretiffy(1)

head(1)

findNextSiblings(1)

group(1)

encodeContents(1)

attrs(1)

Esempio n. 1

Mostra file

File: ProfileGrab.py Progetto: tommorris/pyprofile

class ProfileGrab:
  """Fetch a web page and try to work out the name of its author.

  On construction the page at ``uri`` is fetched and parsed, then a chain of
  detection strategies is run until one of them sets ``self.author``:

  1. ``<meta name="author">`` / ``<meta name="DC.creator">`` tags,
  2. an hCard (``class="vcard"``) that belongs to this page,
  3. the author recorded in the page's RSS/Atom feed,
  4. the text of the first ``mailto:`` link.

  ``self.author`` stays ``None`` when every strategy fails.
  """

  # Text type used to normalise author names: ``unicode`` on Python 2 and
  # ``str`` on Python 3, where the ``unicode`` builtin no longer exists.
  try:
    _text = unicode
  except NameError:
    _text = str

  def __init__(self, uri):
    """Fetch ``uri``, parse it and run the author-detection chain.

    ``uri`` is the address of the page to inspect; it is also the value
    hCard ``uid``/``url`` links are compared against in ``hcard``.
    """
    self.fetcher = DiskCacheFetcher('/tmp')
    self.uri = uri
    # NOTE(review): 43200 is presumably a cache lifetime in seconds (12 h);
    # confirm against DiskCacheFetcher.fetch.
    self.data = self.fetcher.fetch(uri, 43200)
    # presume it's html for now, we'll refactor to take into account other stuff
    self.soup = BeautifulSoup(self.data)

    # 1. grab meta-author and meta-dc.creator tags
    self.author = self.meta_author()

    # 2. hCard, then RSS feeds, then mailto: links. Each detector only runs
    # while no earlier one has produced an author.
    for detect in (self.hcard, self.detectRss, self.mailtoLinkDetect):
      if self.author is not None:
        break
      detect()

  def hcard(self):
    """Set ``self.author`` from the hCard that describes this page.

    A lone hCard is assumed to describe the page. With several hCards, a
    card is selected when its first ``uid`` link (or, when it has no ``uid``
    link, any of its ``url`` links) points at ``self.uri``; if several cards
    qualify the last one wins. The author is only set when the selected card
    has exactly one ``fn`` element and no author is known yet.
    """
    # declare
    regex_uid = re.compile('uid')
    regex_url = re.compile('url')
    regex_fn = re.compile('fn')

    self.loadHcards()
    selectedCard = None
    if len(self.hcards) == 1:
      selectedCard = self.hcards[0]
    else:
      for card in self.hcards:
        uidLinks = card.findAll(['a', 'link'], {'class': regex_uid})
        if uidLinks:
          # Only the first uid link is authoritative for a card.
          candidates = uidLinks[:1]
        else:
          candidates = card.findAll(['a', 'link'], {'class': regex_url})
        for link in candidates:
          href = link.get('href')
          # Compare by value: two equal URLs are rarely the same object,
          # so an identity test (``is``) would almost never match.
          if href is not None and href == self.uri:
            selectedCard = card

    if selectedCard is None:
      return
    selectedNames = selectedCard.findAll(True, {'class': regex_fn})
    if len(selectedNames) == 1 and self.author is None:
      fullName = ''.join(selectedNames[0].findAll(text=True)).strip()
      self.author = self._text(fullName)

  def getFoafFromHtml(self):
    """Load the RDF documents the page links to into one graph.

    Follows ``<link>`` elements of type ``application/rdf+xml`` and
    ``application/n3`` that carry an ``href``.

    Returns the populated ``rdflib.ConjunctiveGraph``, or ``False`` when no
    triples were loaded.
    """
    data = rdflib.ConjunctiveGraph()
    # (link MIME type, rdflib parser name) for each supported serialisation.
    serialisations = (
      ("application/rdf+xml", "xml"),  # get rdf/xml links
      ("application/n3", "n3"),        # get n3 links
    )
    for mimeType, rdfFormat in serialisations:
      for link in self.soup.findAll("link", {"type": mimeType, "href": True}):
        # NOTE(review): hrefs are passed through as written; a relative href
        # would need resolving against self.uri first.
        data.parse(str(link['href']), format=rdfFormat)

    if len(data) != 0:
      return data
    return False

  def getNameFromFoaf(self, uri):
    """Query the FOAF document at ``uri`` for ``foaf:name`` values.

    Returns the result object of the SPARQL ``SELECT ?name`` query; callers
    iterate it to obtain the matching names.
    """
    # The first positional argument of ConjunctiveGraph is the store, not a
    # source document, so the document has to be loaded with parse().
    # NOTE(review): no format is given, so rdflib's default/guessed parser is
    # used; confirm this suits the FOAF documents being read.
    foaf = rdflib.ConjunctiveGraph()
    foaf.parse(uri)
    queryString = """PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    SELECT ?name WHERE {
      OPTIONAL {
        ?g a foaf:PersonalProfileDocument;
        foaf:primaryTopic ?person .
      }
      ?person foaf:name ?name .
    }"""
    return foaf.query(queryString)

  def meta_author(self):
    """Return the author named in the page's meta tags, or ``None``.

    Looks at ``<meta>`` tags whose ``name`` is ``author`` or ``DC.creator``
    and returns the first non-empty ``content`` value as text.
    """
    authornames = self.soup.findAll('meta', {'name': ['author', 'DC.creator']})
    for tag in authornames:
      # A meta tag without (or with an empty) content attribute carries no
      # name; skip it instead of failing on the missing key.
      content = tag.get('content')
      if content:
        return self._text(content)
    return None

  def loadHcards(self):
    """Store every element whose class matches ``vcard`` in ``self.hcards``."""
    regex_hcard = re.compile('vcard')
    self.hcards = self.soup.findAll(True, {'class': regex_hcard})

  def detectRss(self):
    """Set ``self.author`` from the page's RSS/Atom feed when possible.

    The feed-level author name is preferred. When there is none, the most
    common per-entry author (as chosen by ``getMostPopularFromList``) is
    used. ``self.rss_feed`` records the feed link that was found.
    """
    self.rss_feed = autorss.getRSSLinkFromHTMLSource(self.data)
    if not self.rss_feed:
      # No feed advertised by the page: nothing to parse.
      return
    # NOTE(review): 43200 is passed as feedparser.parse's second positional
    # argument (the etag in the stock feedparser); it looks like a cache
    # lifetime copied from the fetcher call. Confirm the intent.
    firstfeed = feedparser.parse(self.rss_feed, 43200)

    # feedparser exposes feed-level metadata under ``.feed``; the top-level
    # result is checked first to stay compatible with the previous lookup.
    for holder in (firstfeed, getattr(firstfeed, 'feed', None)):
      detail = getattr(holder, 'author_detail', None)
      name = getattr(detail, 'name', None)
      if name is not None:
        self.author = self._text(name)
        return

    authorarray = [entry.author for entry in firstfeed.entries
                   if hasattr(entry, 'author')]
    if authorarray:
      self.author = self._text(getMostPopularFromList(authorarray))

  def mailtoLinkDetect(self):
    """Set ``self.author`` to the first child of the first usable mailto link.

    Links without any children (for example ``<link href="mailto:...">``)
    are skipped because they have nothing to display as a name.
    """
    mailtoLinks = self.soup.findAll(['a', 'link'], {'href': re.compile('mailto:')})
    for link in mailtoLinks:
      if link.contents:
        self.author = self._text(link.contents[0])
        break