def scrap_serie(self, serie):
    """Create one Episode per entry of the serie's mediateca listing.

    The page at ``serie.url`` is downloaded, every ``<?...?>`` chunk is
    removed from the markup, and each ``td.listado-mediateca-menu`` cell
    becomes a saved ``Episode`` (season 1, numbered from 1 in page order).
    Relative thumbnail and episode links are resolved against
    ``self.base_url``. After saving, the episode and its absolute page
    URL are passed to ``self.scrap_episode``.
    """
    raw_page = urllib.urlopen(serie.url).read()
    # Drop <?...?> chunks before the markup reaches the parser.
    cleaned_page = re.sub('<\?.*?\?>', '', raw_page)
    soup = BeautifulSoup(cleaned_page, from_encoding='utf8')

    cells = soup.find_all('td', 'listado-mediateca-menu')
    for position, cell in enumerate(cells, 1):
        episode = Episode()
        episode.serie = serie

        thumbnail_src = cell.find('div', 'imagen-mediateca').img.get('src')
        episode.thumbnail = urllib.basejoin(self.base_url, thumbnail_src)

        title_link = cell.find('h5', 'titulo-mediateca').a
        episode.name = title_link.text
        print(episode.name.encode('utf8'))

        episode.description = cell.find('span', 'texto-mediateca').text
        episode_url = urllib.basejoin(self.base_url, title_link.get('href'))

        episode.number = position
        episode.season = 1
        episode.save()

        self.scrap_episode(episode, episode_url)
def scrap_serie(self, serie):
    """Create an Episode and its media file for every listed video.

    The page at ``serie.url`` is downloaded and each ``li`` of the ordered
    list inside ``div.primary-pane`` becomes a saved ``Episode`` (season 1,
    numbered from 1 in page order) with thumbnail, name and duration.
    The video id is taken from the ``v=`` query parameter of the entry's
    link, and an ``HttpMediaFile`` pointing at
    ``self.get_real_url(video_id)`` is saved for the episode.

    Raises whatever the lookups raise when an entry lacks the expected
    markup (e.g. ``AttributeError``, or ``IndexError`` when the link has
    no ``v=`` parameter).
    """
    html = urllib.urlopen(serie.url).read()
    soup = BeautifulSoup(html, from_encoding='utf8')

    entries = soup.find('div', 'primary-pane').ol('li')
    for number, article in enumerate(entries, 1):
        episode = Episode()
        episode.thumbnail = article.img.get('src')
        episode.name = article.find('span', 'video-overview').span.text
        print(episode.name.encode('utf8'))

        # The label is colon-separated. Two fields are minutes:seconds; a
        # third leading field, when present, is taken as hours (assumed
        # H:MM:SS -- confirm against the site). Left-padding with zeros
        # maps both shapes onto (hours, minutes, seconds) instead of
        # mis-reading "H:MM:SS" as "MM:SS".
        label = article.find('span', 'video-time').text
        fields = [int(part) for part in label.split(':')]
        hours, minutes, seconds = ([0, 0, 0] + fields)[-3:]
        episode.duration = time(hours, minutes, seconds)

        episode.serie = serie
        episode.number = number
        episode.season = 1
        episode.save()

        video_id = re.findall('v=([^&]+)', article.a.get('href'))[0]
        media = HttpMediaFile()
        media.url = self.get_real_url(video_id)
        media.episode = episode
        media.save()
def scrap_serie(self, serie):
    """Scrape the genre and all season/episode entries of a serie.

    The page at ``serie.url`` is downloaded (parsed as latin-1). If the
    ``div.titSerieEncabezado`` header text contains a genre label, the
    matching ``Genre`` is fetched or created and added to ``serie.genres``.
    Every ``ul.serieCap`` element is treated as one season: the season
    number is the last character of the element's ``id`` when that is a
    digit, otherwise 1. Each ``li`` inside it becomes a saved ``Episode``
    that is then passed, with its absolute URL, to ``self.get_episode``.

    Scraping is best-effort: any exception is printed and swallowed, so
    a failure aborts the remainder of this serie without propagating.
    """
    # Example series URL:
    # http://www.conectate.gob.ar/educar-portal-video-web/module/detalleRecurso/DetalleRecurso.do?canalId=1&modulo=menu&temaCanalId=1&tipoEmisionId=3&idRecurso=50123
    try:
        html = urllib.urlopen(serie.url).read()
        soup = BeautifulSoup(html, from_encoding='latin-1')

        header_text = soup.find('div', 'titSerieEncabezado').text
        genre_names = re.compile("nero:\t +(\w+)", re.DOTALL | re.UNICODE).findall(header_text)
        if genre_names:
            genre = Genre.objects.get_or_create(name=genre_names[0])[0]
            serie.genres.add(genre)

        for seasonelem in soup('ul', 'serieCap'):
            # A missing id falls back to season 1, same as a non-digit
            # suffix, instead of aborting the whole serie.
            suffix = (seasonelem.get('id') or '')[-1:]
            season = int(suffix) if suffix.isdigit() else 1

            for number, episodeelem in enumerate(seasonelem("li"), 1):
                episode = Episode()
                episode.serie = serie
                # Record the season computed above; it was previously
                # calculated but never stored on the episode.
                episode.season = season
                episode.name = episodeelem.a.text
                episode.number = number
                episode_url = urllib.basejoin(self.BASE_URL, episodeelem.a.get("href"))
                episode.save()

                self.get_episode(episode, episode_url)
                print("EPISODE S%02dE%02d: %s" % (episode.season, episode.number, episode.name.encode('utf8')))
    except Exception as e:
        # Deliberate best-effort: report the problem and give up on
        # this serie.
        print(e)
def get_episode(self, serie, url):
    """Create an Episode and media file for each video thumbnail at url.

    The page is downloaded and every ``li`` under
    ``#ms-player-thumb-videos ul`` becomes a saved ``Episode`` of
    ``serie``. Numbering continues after the episodes the serie already
    has (``serie.episode_set.count() + 1`` onwards). For each entry the
    player index is read from the ``(N)`` argument in the link's
    ``onclick`` attribute; the element following ``#ms-player2-N`` holds
    a script whose second double-quoted string is used as the ``sig``
    of the stream URL, built together with ``self.player_key``.

    Raises ``IndexError`` when an entry lacks the expected markup.
    """
    first_number = serie.episode_set.count() + 1
    html = urllib.urlopen(url).read()
    dom = lxml.html.document_fromstring(html)

    thumbs = dom.cssselect("#ms-player-thumb-videos ul li")
    for number, elem in enumerate(thumbs, first_number):
        episode = Episode()
        episode.serie = serie
        episode.number = number
        episode.name = elem.cssselect(".ms-thumb-titulo")[0].text_content()
        # Encode explicitly so non-ASCII titles print on any stdout,
        # matching the other scrapers in this file.
        print(episode.name.encode('utf8'))
        episode.thumbnail = elem.cssselect(".ms-thumb-img img")[0].get('src')
        episode.save()

        # Accept multi-digit indexes: a single-digit pattern fails to
        # match "(10)" and beyond, which made the [0] lookup raise.
        onclick = elem.cssselect("a")[0].get('onclick')
        num = re.findall(r"\(([0-9]+)\)", onclick)[0]

        elemscript = dom.cssselect("#ms-player2-%s" % num)[0].getnext()
        sig = re.findall('"(.*?)"', elemscript.text_content())[1]

        media = HttpMediaFile()
        media.episode = episode
        media.url = "http://api.kewego.com/video/getHTML5Stream?playerKey=%s&sig=%s&format=normal" % (self.player_key, sig)
        media.save()