def get_content(self):
    """Return the entry's title and summary as one plain-text string.

    The title and summary are UTF-8 encoded, joined with a newline,
    converted back through ``to_unicode`` and ``sweep``, and finally
    parsed by lxml so that only the text content (no HTML tags) is
    returned.
    """
    # Encode both parts first so the %-formatting works on byte strings.
    title_bytes = self.get_title().encode('utf-8')
    summary_bytes = self.get_summary().encode('utf-8')
    merged = '%s\n%s' % (title_bytes, summary_bytes)

    # NOTE(review): sweep() is defined elsewhere; presumably it tidies the
    # text before parsing -- confirm against its definition.
    cleaned = sweep(to_unicode(merged))

    # text_content() drops the markup and keeps only the text nodes.
    return lxml.html.fromstring(cleaned).text_content()
def request(self, url):
    """Download ``url`` and parse the response into ``self.dom``.

    The open response object is kept on ``self.req``; the body is read,
    passed through ``to_unicode`` and ``self.clean``, and parsed with
    ``lxml.html.fromstring``.

    Args:
        url: Address handed to ``urllib.urlopen``.

    Returns:
        The value of ``self.req.getcode()`` on success, or ``0`` when the
        URL could not be opened or lxml could not parse the document.
    """
    # Catch Exception rather than using a bare ``except:`` so that
    # KeyboardInterrupt and SystemExit still propagate instead of being
    # silently reported as a failed download.
    try:
        self.req = urllib.urlopen(url)
    except Exception:
        # Unable to download.
        return 0

    doc = self.clean(to_unicode(self.req.read()))

    try:
        self.dom = lxml.html.fromstring(doc)
    except Exception:
        # lxml could not build a document from the response body.
        return 0

    return self.req.getcode()