Beispiel #1
0
 def get_content(self):
     """Return the title and summary as one tag-free text blob.

     The UTF-8 encoded title and summary are joined with a newline,
     passed through ``to_unicode`` and ``sweep`` (helpers defined
     outside this block -- presumably decoding and whitespace/noise
     cleanup; confirm against their definitions), and finally parsed
     by lxml so that only the textual content remains.
     """
     # Encode both parts first, title before summary.
     title_bytes = self.get_title().encode('utf-8')
     summary_bytes = self.get_summary().encode('utf-8')
     merged = '%s\n%s' % (title_bytes, summary_bytes)

     cleaned = sweep(to_unicode(merged))

     # text_content() yields the text of the parsed fragment without markup.
     return lxml.html.fromstring(cleaned).text_content()
Beispiel #2
0
 def request(self, url):
     """Download *url* and parse its body into an lxml tree.

     On success the open response is kept on ``self.req`` and the parsed
     document on ``self.dom``.

     Args:
         url: Address handed to ``urllib.urlopen``.

     Returns:
         The value of ``self.req.getcode()`` when the page was fetched
         and parsed, or ``0`` if opening the URL or parsing the document
         raised an error.
     """
     try:
         self.req = urllib.urlopen(url)
     except Exception:
         # Unable to download. ``Exception`` (not a bare ``except``) so
         # that KeyboardInterrupt / SystemExit are not swallowed.
         return 0
     doc = self.clean(to_unicode(self.req.read()))
     try:
         self.dom = lxml.html.fromstring(doc)
     except Exception:
         # lxml could not build a tree from the document.
         return 0
     return self.req.getcode()
Beispiel #3
0
 def request(self, url):
     """Fetch *url*, clean the response body and parse it with lxml.

     Side effects: stores the response object on ``self.req`` and, when
     parsing succeeds, the resulting tree on ``self.dom``.

     Args:
         url: Address handed to ``urllib.urlopen``.

     Returns:
         ``self.req.getcode()`` on success; ``0`` when the URL could not
         be opened or the document could not be parsed.
     """
     try:
         self.req = urllib.urlopen(url)
     except Exception:
         # Download failed. Catching ``Exception`` instead of everything
         # lets KeyboardInterrupt and SystemExit propagate normally.
         return 0
     doc = self.clean(to_unicode(self.req.read()))
     try:
         self.dom = lxml.html.fromstring(doc)
     except Exception:
         # Parsing failed inside lxml.html.
         return 0
     return self.req.getcode()