def obj_id(self):
    """Return the identifier for this item.

    The identifier is the part of the item's link (``./a/@href``) captured
    by the ``//www.france.tv/(.*)/`` pattern. When the pattern yields
    nothing (no match, or an empty capture), a synthetic identifier is
    built instead: ``vid_`` followed by the MD5 hex digest of the element's
    cleaned text.
    """
    slug = Regexp(CleanText('./a/@href'), '//www.france.tv/(.*)/', default=None)(self)
    if slug:
        return slug

    # Fallback: derive an id from the element's own text. Non-ASCII
    # characters are dropped before hashing, so two labels that differ only
    # in such characters produce the same digest.
    label = CleanText('.')(self)
    digest = hashlib.md5(label.encode('ascii', 'ignore')).hexdigest()
    return u'vid_%s' % digest
def __init__(self, *args, **kwargs):
    """Initialise the page, then replace ``self.doc`` with embedded JSON.

    After the regular ``HTMLPage`` initialisation, the text of the page's
    ``<script>`` elements is searched for the
    ``window["initialData"] = JSON.parse("...")`` assignment that is
    immediately followed by ``window["tags"]``. The captured string is
    unescaped, parsed as JSON, and stored in ``self.doc`` in place of the
    HTML document.

    All positional and keyword arguments are forwarded unchanged to
    ``HTMLPage.__init__``.
    """
    HTMLPage.__init__(self, *args, **kwargs)

    # Capture the JSON object literal passed (as an escaped string) to
    # JSON.parse in the inline script.
    extract_initial_data = Regexp(
        CleanText('//script'),
        r"window\[\"initialData\"\] = JSON.parse\(\"({.*})\"\);window\[\"tags\"\]"
    )
    escaped_payload = extract_initial_data(self.doc)

    # The payload sits inside a JavaScript string literal, so backslash
    # escapes have to be resolved before it is valid JSON text.
    unescaped_payload, _consumed = codecs.unicode_escape_decode(escaped_payload)

    # Round-trip through UTF-8 with the 'surrogatepass' handler on encode.
    # NOTE(review): presumably meant to normalise surrogate code points
    # produced by the unescape step; confirm behaviour on pages that
    # contain such characters.
    utf8_bytes = unescaped_payload.encode('utf-8', 'surrogatepass')
    json_text = utf8_bytes.decode('utf-8')

    self.doc = json.loads(json_text)