Beispiel #1
0
class ProfileGrab:
  def __init__(self, uri):
    self.fetcher = DiskCacheFetcher('/tmp')
    self.uri = uri
    self.data = self.fetcher.fetch(uri, 43200)
    # presume it's html for now, we'll refactor to take into account other stuff
    self.soup = BeautifulSoup(self.data)
    self.author = None
    
    # 1. grab meta-author and meta-dc.creator tags
    metaauthor = self.meta_author()
    if metaauthor is not None:
      self.author = metaauthor
    
    # hCard
    if self.author is None:
      self.hcard()
    
    # 4. Look for RSS feeds and parse the names out of there
    if self.author is None:
      self.detectRss()
    
    if self.author is None:
      self.mailtoLinkDetect()
  
  def hcard(self):
    # declare
    regex_uid = re.compile('uid')
    regex_url = re.compile('url')
    regex_fn = re.compile('fn')
    
    self.loadHcards()
    selectedCard = None
    if len(self.hcards) is 1:
      selectedCard = self.hcards[0]
    else:
      for card in self.hcards:
        if card.findAll(['a', 'link'], {'class': regex_uid}):
          if card.findAll(['a', 'link'], {'class': regex_uid})[0]['href'] is self.uri:
            selectedCard = card
        else:
          for url in card.findAll(['a', 'link'], {'class': regex_url}):
            if url['href'] is self.uri:
              selectedCard = card
    if selectedCard is not None:
      selectedNames = selectedCard.findAll(True, {'class': regex_fn})
      if len(selectedNames) is 1 and self.author is None:
        self.author = unicode(''.join(selectedNames[0].findAll(text=True)).strip())
    
  
  def getFoafFromHtml(self):
    # get rdf/xml links
    data = rdflib.ConjunctiveGraph()
    for i in self.soup.query("link", {"type": "application/rdf+xml", "href": True}):
      data.add(str(i['href']))
    
    # get n3 links
    for i in self.soup.query("link", {"type": "application/n3", "href": True}):
      data.add(str(i['href']), 'n3')
    
    if len(data) is not 0:
      return data
    else:
      return False
  
  def getNameFromFoaf(self, uri):
    foaf = rdflib.ConjunctiveGraph(uri)
    queryString = """PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    SELECT ?name WHERE {
      OPTIONAL {
        ?g a foaf:PersonalProfileDocument;
        foaf:primaryTopic ?person .
      }
      ?person foaf:name ?name .
    }"""
    results = foaf.query(queryString)
  
  def meta_author(self):
    authornames = self.soup.findAll('meta', {'name': ['author', 'DC.creator']})
    if len(authornames) is not 0:
      return unicode(authornames[0]['content'])
    else:
      return None
  
  def loadHcards(self):
    regex_hcard = re.compile('vcard')
    self.hcards = self.soup.findAll(True, {'class': regex_hcard})
  
  def detectRss(self):
    self.rss_feed = autorss.getRSSLinkFromHTMLSource(self.data)
    firstfeed = feedparser.parse(self.rss_feed, 43200)
    if hasattr(firstfeed, 'author_detail') is True:
      if firstfeed.author_detail.name is not None:
        self.author = unicode(firstfeed.author_detail.name)
    else:
      authorarray = []
      for i in firstfeed.entries:
        if hasattr(i, 'author'):
          authorarray += [i.author]
      if len(authorarray) is not 0:
        self.author = unicode(getMostPopularFromList(authorarray))
  
  def mailtoLinkDetect(self):
    mailtoLinks = self.soup.findAll(['a', 'link'], {'href': re.compile('mailto:')})
    if len(mailtoLinks) is not 0:
      self.author = unicode(mailtoLinks[0].contents[0])