def get_episodes(self, channel):
    """Import new episodes from the CanalACulturaActiva YouTube user feed.

    Downloads the feed page and walks every ``.feed-item-main`` entry.  The
    entry heading is split on ``-``: the first piece is used as the show
    title and the remaining pieces (re-joined with ``-``) as the episode
    title.  The ``Serie`` is looked up by name and, when missing, created,
    attached to ``channel`` and tagged with the ``CULTURA`` genre.  Entries
    whose episode title already exists as an ``Episode`` name are skipped;
    for the others an ``Episode`` and an ``HttpMediaFile`` are saved.

    :param channel: object assigned to ``serie.channel`` for new series.
    :raises IndexError: when an entry lacks one of the elements selected
        below (``h4``, ``.description``, ``.video-time``,
        ``.video-thumb img``, ``a``) or its link has no ``v=`` parameter.
    """
    feed_url = 'http://www.youtube.com/user/CanalACulturaActiva/feed'
    html = urllib.urlopen(feed_url).read()
    dom = lxml.html.document_fromstring(html)

    for item in dom.cssselect('.feed-item-main'):
        heading = item.cssselect('h4')[0].text_content()
        parts = [piece.strip() for piece in heading.split('-')]
        show_title = parts[0]
        episode_title = '-'.join(parts[1:])

        try:
            serie = Serie.objects.get(name=show_title)
        except Serie.DoesNotExist:
            # First time this show is seen: create it and attach metadata.
            serie = Serie(name=show_title)
            print(">> SERIE: %s" % show_title.encode('utf8'))
            serie.channel = channel
            serie.save()
            serie.genres.add(
                Genre.objects.get_or_create(
                    code='CULTURA', defaults={'name': 'Cultura'})[0])

        # exists() answers the yes/no question without counting every row.
        if Episode.objects.filter(name=episode_title).exists():
            continue

        print("%s" % episode_title)

        # Resolve the video URL *before* persisting the episode.  If the id
        # cannot be extracted or resolved, nothing has been saved yet, so the
        # entry is retried on the next run instead of being skipped forever
        # as an already-known (but media-less) episode.
        link = item.cssselect('a')[0].get('href')
        video_id = re.findall(r'v=([^&]+)', link)[0]
        video_url = get_youtube_url(video_id)

        episode = Episode(serie=serie, name=episode_title, number=0)
        # Description text followed by the video duration on its own line.
        episode.description = (
            item.cssselect('.description')[0].text_content() + "\n" +
            item.cssselect('.video-time')[0].text_content())
        episode.thumbnail = urllib.basejoin(
            self.BASE_URL, item.cssselect('.video-thumb img')[0].get('src'))
        episode.save()

        media = HttpMediaFile(width=640, height=480, mimetype='video/mp4',
                              url=video_url)
        media.episode = episode
        media.save()
        serie.episode_set.add(episode)
def scrap_episode(self, serie, url, thumbnail):
    """Scrape one episode page and store the episode with its media file.

    The page's ``div.description`` text is searched for a line of the form
    ``Temporada <season> | Ep. <number> (<duration>)``.  Nothing is saved
    when an ``Episode`` with this ``url``, or with the same
    serie/season/number, already exists ("EXISTS" is printed instead).

    :param serie: object stored as ``episode.serie``.
    :param url: address of the episode page to download.
    :param thumbnail: value stored as ``episode.thumbnail``.
    :raises AttributeError: when the page has no ``div.description`` (or it
        has no ``<p>`` child).
    :raises IndexError: when the season/episode/duration line or the
        ``player.releaseUrl`` assignment is not found in the page.
    """
    if Episode.objects.filter(url=url).exists():
        print("EXISTS")
        return

    print(url)
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html, from_encoding='utf-8')

    description = soup.find('div', 'description')
    season, number, duration = re.findall(
        r'Temporada ([0-9]+) \| Ep. ([0-9]+) \(([0-9:]+)\)',
        description.text)[0]
    season = int(season)
    number = int(number)

    if Episode.objects.filter(serie=serie, season=season,
                              number=number).exists():
        print("EXISTS")
        return

    # Extract the player URL before saving anything: if it is missing the
    # IndexError must not leave behind an episode without media, which the
    # season/number check above would then skip on every later run.
    smil_url = re.findall('player.releaseUrl = "([^"]+)"', html)[0]
    smil_url += "&manifest=m3u&format=SMIL&Tracking=true&Embedd=true"

    # "MM:SS" (or a bare "MM") gets a zero hour in front; a full "H:MM:SS"
    # is used as is, so its seconds no longer end up in the microsecond
    # argument of time().
    clock = [int(piece) for piece in duration.split(':')]
    if len(clock) != 3:
        clock.insert(0, 0)

    episode = Episode()
    episode.serie = serie
    # Remember the page address so the url-based check at the top can
    # recognise this episode next time without downloading the page again.
    episode.url = url
    episode.season = season
    episode.number = number
    episode.duration = time(*clock)
    # First child node of the description's first <p>.
    episode.description = next(description.p.children)
    episode.thumbnail = thumbnail
    episode.save()

    media = MundoFoxMediaFile()
    media._url = smil_url
    media.episode = episode
    media.save()
def scrap_serie(self, serie):
    """Save an episode for each listing cell of the serie page, then scrape it.

    Downloads ``serie.url``, removes every ``<?...?>`` span from the markup
    and parses the result.  Each ``td.listado-mediateca-menu`` cell yields
    one ``Episode`` (season 1, numbered by position starting at 1) that is
    saved and then passed, with its absolute page URL, to
    ``self.scrap_episode``.

    :param serie: object providing ``url``; stored as ``episode.serie``.
    """
    markup = urllib.urlopen(serie.url).read()
    markup = re.sub(r'<\?.*?\?>', '', markup)
    soup = BeautifulSoup(markup, from_encoding='utf8')

    cells = soup('td', 'listado-mediateca-menu')
    for position, cell in enumerate(cells, 1):
        episode = Episode()
        episode.serie = serie

        image_src = cell.find('div', 'imagen-mediateca').img.get('src')
        episode.thumbnail = urllib.basejoin(self.base_url, image_src)

        # The title heading's link carries both the name and the page href.
        title_link = cell.find('h5', 'titulo-mediateca').a
        episode.name = title_link.text
        print(episode.name.encode('utf8'))

        episode.description = cell.find('span', 'texto-mediateca').text
        page_url = urllib.basejoin(self.base_url, title_link.get('href'))

        episode.number = position
        episode.season = 1
        episode.save()

        self.scrap_episode(episode, page_url)