def get_episodes(self, channel): url = 'http://www.youtube.com/user/CanalACulturaActiva/feed' j = 1 shows = {} html = urllib.urlopen(url).read() dom = lxml.html.document_fromstring(html) for item in dom.cssselect('.feed-item-main'): p = [x.strip() for x in item.cssselect('h4')[0].text_content().split('-')] show_title = p[0] episode_title = '-'.join(p[1:]) try: serie = Serie.objects.get(name=show_title) except Serie.DoesNotExist: serie = Serie(name=show_title) print ">> SERIE: %s" % show_title.encode('utf8') serie.channel = channel serie.save() serie.genres.add(Genre.objects.get_or_create(code='CULTURA',defaults={'name':'Cultura'})[0]) if Episode.objects.filter(name=episode_title).count() > 0: continue episode = Episode(serie=serie, name=episode_title, number=0) print "%s" % episode_title episode.description = item.cssselect('.description')[0].text_content() + "\n" + \ item.cssselect('.video-time')[0].text_content() episode.thumbnail = urllib.basejoin(self.BASE_URL, item.cssselect('.video-thumb img')[0].get('src')) episode.save() url2 = item.cssselect('a')[0].get('href') video_id = re.findall('v=([^&]+)', url2)[0] video_url = get_youtube_url(video_id) media = HttpMediaFile(width=640, height=480, mimetype='video/mp4', url=video_url) media.episode = episode media.save() serie.episode_set.add(episode)
def get_shows(self, channel, url, params): while True: print "PAGE %d" % params['pagina'] real_url = "%s?%s" % (url,urllib.urlencode(params)) html = urllib.urlopen(real_url).read() soup = BeautifulSoup(html,from_encoding='latin-1') answer = [] found = False for dataitem in soup('div','resBusqueda'): found = True name = dataitem.h1.a.text.strip() print "%s..." % name.encode('utf8') if channel.serie_set.filter(name=name).count() > 0: print "EXIST" continue serie = Serie() serie.channel = channel serie.name = name serie.thumbnail = dataitem.find('div','resBusqueda_thumb').img.get('src') serie.description = dataitem.p.text serie.url = urllib.basejoin(self.BASE_URL, dataitem.h1.a.get('href')) #self.get_episodes(serie, serie_url) serie.save() print "OK" if not found: break params['pagina'] += 1
def scrap_channel(self, channel): url = channel.urls[0] i = url.index('?') base_url = url[:i] params = dict(urlparse.parse_qsl(url[i+1:])) if 'page' not in params: params['page'] = 1 params['page'] = int(params['page']) found = True while found: found = False print "PAGE %d" % params['page'] real_url = "%s?%s" % (base_url,urllib.urlencode(params)) html = urllib.urlopen(real_url).read() soup = BeautifulSoup(html, from_encoding='utf8') try: for elem in soup('div','playlist-metadata'): found = True serie = Serie() serie.channel = channel serie.name = elem.h3.a.text.strip() print serie.name.encode("utf8") serie.url = urllib.basejoin(self.BASE_URL, elem.h3.a.get('href')) serie.save() params['page'] += 1 except: pass
def scrap_channel(self, channel, url):
    """Register the single hard-wired show 'Karlos Arguiñano en tu cocina'
    under *channel*, then scrape its episodes.

    NOTE(review): the *url* argument is ignored — it is overwritten with a
    hard-coded address before use; confirm no caller relies on passing a
    different page here.
    """
    self.get_player_key()
    serie = Serie(
        name="Karlos Arguiñano en tu cocina",
        thumbnail="http://static.hogarutil.com/archivos/201109/logotipo-karlos-arguinano-2012-173x125x80xX.jpg?1",
        channel=channel,
    )
    serie.save()
    url = "http://www.hogarutil.com/tv/programas/karlos-arguinano-cocina/index.html"
    self.get_show(serie, url)
def scrap_channel(self, channel):
    """Create a Serie under *channel* for each 'div.item-menu' entry on the
    channel's first URL, stripping PHP-style '<? ... ?>' tags first so the
    parser does not choke on them."""
    raw = urllib.urlopen(channel.urls[0]).read()
    cleaned = re.sub('<\?.*?\?>', '', raw)
    soup = BeautifulSoup(cleaned)
    for entry in soup('div', 'item-menu'):
        show = Serie(channel=channel)
        show.name = entry.a.text
        show.url = urllib.basejoin('http://www.tvpublica.com.ar/tvpublica/', entry.a.get('href'))
        show.save()
def get_serie(self, channel, genre, url): if Serie.objects.filter(url=url).count() > 0: print "EXISTS" return html = urllib.urlopen(url).read() soup = BeautifulSoup(html, from_encoding='utf-8') info = soup.find('article','info') serie_name = info.strong.text.strip() if Serie.objects.filter(name=serie_name).count() > 0: print "EXISTS" return serie = Serie(channel=channel) serie.name = serie_name serie.url = url serie.thumbnail = urllib.basejoin(self.BASE_URL, info.img.get('src')) serie.description = info.find('div','expandable').text serie.save() serie.genres.add(genre)