Ejemplo n.º 1
0
    def get_url(self, lang, quality):
        """
        Resolve the URL of the video for a language and a quality.

        The ``movie`` parameter of the page's ``<object>`` carries a
        URL-quoted ``videorefFileUrl=`` argument.  The document behind it
        lists one ``<video>`` reference per language, and the selected
        reference lists one ``<url>`` per quality.

        :param lang: wanted language, compared to the ``lang`` attributes
        :param quality: wanted quality, compared to the ``quality`` attributes
        :return: text of the selected ``<url>`` element

        When the wanted language or quality is not listed, an arbitrary
        available entry is used instead (``dict.popitem``).
        """
        player = select(self.document.getroot(), 'object', 1)
        movie_param = select(player, 'param[name=movie]', 1)
        quoted_ref = movie_param.attrib['value'].split('videorefFileUrl=')[-1]
        index_url = urllib.unquote(quoted_ref)

        # First document: language -> URL of the per-language description.
        index_doc = self.browser.get_document(self.browser.openurl(index_url))
        refs_by_lang = {}
        for node in select(index_doc.getroot(), 'video'):
            refs_by_lang[node.attrib['lang']] = node.attrib['ref']
        ref_url = refs_by_lang[lang] if lang in refs_by_lang else refs_by_lang.popitem()[1]

        # Second document: quality -> URL of the video itself.
        ref_doc = self.browser.get_document(self.browser.openurl(ref_url))
        urls_node = select(ref_doc.getroot(), 'urls', 1)
        urls_by_quality = {}
        for node in select(urls_node, 'url'):
            urls_by_quality[node.attrib['quality']] = node.text

        if quality in urls_by_quality:
            return urls_by_quality[quality]
        return urls_by_quality.popitem()[1]
Ejemplo n.º 2
0
    def iter_videos(self):
        """
        Iterate over the videos listed in ``div.container-videos ul``.

        Each ``<li>`` yields one :class:`InaVideo` built from the link target,
        the thumbnail image, the title, the ``day/month/year`` date and the
        ``[<h>h][<m>min]<s>s`` duration found in the item.

        Nothing is yielded when the list cannot be selected.

        :raises SelectElementException: if a duration text does not match
            the expected format
        """
        try:
            listing = select(self.document.getroot(), 'div.container-videos ul', 1)
        except SelectElementException:
            # The result list is absent: the page has no results.
            return

        duration_re = re.compile(r'((\d+)h)?((\d+)min)?(\d+)s')
        for item in listing.findall('li'):
            link = item.find('a')
            video_id = re.sub(r'/video/(.+)\.html', r'\1', link.attrib['href'])

            # The image source is site-relative, so prefix the host.
            thumbnail = 'http://boutique.ina.fr%s' % link.find('img').attrib['src']

            title = select(item, 'p.titre', 1).text

            date_text = select(item, 'p.date', 1).text
            day, month, year = map(int, date_text.split('/'))
            published = datetime.datetime(year, month, day)

            duration_text = select(item, 'p.duree', 1).text
            found = duration_re.match(duration_text)
            if not found:
                raise SelectElementException('Unable to match duration (%r)' % duration_text)
            # Hours and minutes are optional groups; seconds are mandatory.
            length = datetime.timedelta(hours=int(found.group(2) or 0),
                                        minutes=int(found.group(4) or 0),
                                        seconds=int(found.group(5)))

            yield InaVideo(video_id,
                           title=title,
                           date=published,
                           duration=length,
                           thumbnail_url=thumbnail,
                          )
Ejemplo n.º 3
0
    def iter_videos(self):
        """
        Iterate over the videos listed in the page's ``span#miniatura`` blocks.

        Each block yields one :class:`YoujizzVideo` built from the link
        target, the first ``<img>`` source, the ``span#title1`` text and the
        duration text of ``span.thumbtime span``.

        The duration text may be ``N/A`` (zero duration), ``MM:SS`` or
        ``H:MM:SS``.

        :raises SelectElementException: if the duration text is anything
            else, including non-numeric fields
        """
        span_list = select(self.document.getroot(), 'span#miniatura')
        for span in span_list:
            a = select(span, 'a', 1)
            url = a.attrib['href']
            _id = re.sub(r'/videos/(.+)\.html', r'\1', url)

            thumbnail_url = span.find('.//img').attrib['src']

            title_el = select(span, 'span#title1', 1)
            title = title_el.text.strip()

            time_span = select(span, 'span.thumbtime span', 1)
            time_txt = time_span.text.strip()
            if time_txt == 'N/A':
                fields = [0, 0]
            else:
                try:
                    fields = [int(v) for v in time_txt.split(':')]
                except ValueError:
                    # Non-numeric field: report it as a parse failure below
                    # instead of leaking a bare ValueError.
                    fields = []
                if len(fields) not in (2, 3):
                    raise SelectElementException('Unable to parse the video duration: %s' % time_txt)
            # Left-pad with zero so MM:SS and H:MM:SS unpack the same way.
            hours, minutes, seconds = [0] * (3 - len(fields)) + fields

            yield YoujizzVideo(_id,
                               title=title,
                               duration=datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds),
                               thumbnail_url=thumbnail_url,
                               )
Ejemplo n.º 4
0
 def get_current(self, radio):
     """
     Return the artist and title currently displayed for a radio.

     The ``general`` radio is read from ``div#titre`` and ``div#artiste``;
     any other radio is read from the same ids suffixed with ``_<radio>``.

     :param radio: name of the radio
     :return: ``(artist, title)`` tuple of unicode strings
     """
     suffix = '' if radio == 'general' else '_%s' % radio

     title_node = select(self.document.getroot(), 'div#titre%s' % suffix, 1)
     title = title_node.text.strip()

     artist_node = select(self.document.getroot(), 'div#artiste%s' % suffix, 1)
     artist = artist_node.text.strip()

     return unicode(artist), unicode(title)
Ejemplo n.º 5
0
    def get_video(self, video=None):
        """
        Fill a video with the title, duration and file URL found on the page.

        :param video: object to fill; a new :class:`YoujizzVideo` is created
            from the ``id`` entry of ``self.group_dict`` when it is ``None``
        :return: the filled video
        :raises SelectElementException: if the runtime cannot be found or
            parsed (``MM:SS`` or ``H:MM:SS`` are accepted), or if the page
            does not contain exactly one media ``.flv`` URL
        """
        _id = to_unicode(self.group_dict["id"])
        if video is None:
            video = YoujizzVideo(_id)
        title_el = select(self.document.getroot(), "title", 1)
        video.title = to_unicode(title_el.text.strip())

        # The markup is not reliable enough for element selection, so the
        # serialized page is searched with regular expressions instead.
        data = lxml.html.tostring(self.document.getroot())
        m = re.search(r"<strong>.*?Runtime.*?</strong> (.+?)<br.*>", data)
        if m is None:
            raise SelectElementException("Could not retrieve video duration")
        try:
            fields = [int(v) for v in to_unicode(m.group(1).strip()).split(":")]
        except (ValueError, TypeError):
            raise SelectElementException("Could not retrieve video duration")
        if len(fields) not in (2, 3):
            raise SelectElementException("Could not retrieve video duration")
        # Left-pad with zero so MM:SS and H:MM:SS unpack the same way.
        hours, minutes, seconds = [0] * (3 - len(fields)) + fields
        video.duration = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)

        video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data)
        if not video_file_urls:
            raise SelectElementException("Video URL not found")
        if len(video_file_urls) > 1:
            raise SelectElementException("Many video file URL found")
        video.url = video_file_urls[0]

        return video
Ejemplo n.º 6
0
    def iter_videos(self):
        """
        Iterate over the ``div[class=video]`` entries of the page.

        Each entry yields one :class:`ArteVideo`.  The id is taken from the
        ``/fr/videos/<id>.html`` link of the ``<h2>`` heading (empty string
        when the link has another form).  The rating is the number of
        ``star-rating-on`` divs in the rate container and ``rating_max`` is
        the total number of divs in it.
        """
        for entry in self.document.getroot().cssselect("div[class=video]"):
            link = entry.find('h2').find('a')
            title = link.text
            matched = re.match(r'/fr/videos/(.*)\.html', link.attrib['href'])
            _id = matched.group(1) if matched else ''

            stars = select(entry, 'div[class=rateContainer]', 1).findall('div')
            rating = sum(1 for star in stars if 'star-rating-on' in star.attrib['class'])
            rating_max = len(stars)

            # The image source is site-relative, so prefix the host.
            image = select(entry, 'img[class=thumbnail]', 1)
            thumbnail_url = 'http://videos.arte.tv' + image.attrib['src']

            yield ArteVideo(_id,
                            title=title,
                            rating=rating,
                            rating_max=rating_max,
                            thumbnail_url=thumbnail_url)
Ejemplo n.º 7
0
    def set_details(self, v):
        """
        Fill a video object from the ``<li>`` entries of the ``#details`` block.

        Each entry is a ``<span>`` label followed by its value as tail text.
        Recognized labels and the attributes they set on ``v``:

        * ``Duration:`` -> ``duration`` (from ``<n>min`` / ``<n>sec`` words)
        * ``Submitted:`` -> ``author`` (text of the ``<i>`` or ``<a>`` child,
          else the raw value)
        * ``Rating:`` -> ``rating`` and ``rating_max`` (first and third words)
        * ``Date:`` -> ``date`` (only when ``self.DATE_REGEXP`` matches)

        Entries with any other label are ignored.

        :param v: video object to update in place
        """
        details = select(self.document.getroot(), '#details', 1)
        for item in details.getiterator('li'):
            label_span = item.find('span')
            label = label_span.text.strip()
            text = label_span.tail.strip()

            if label == 'Duration:':
                minutes = 0
                seconds = 0
                for token in text.split():
                    if token.endswith('min'):
                        minutes = int(token.partition('min')[0])
                    elif token.endswith('sec'):
                        seconds = int(token.partition('sec')[0])
                v.duration = datetime.timedelta(minutes=minutes, seconds=seconds)
            elif label == 'Submitted:':
                # Prefer an <i> child, then an <a> child, then the raw text.
                author_el = item.find('i')
                if author_el is None:
                    author_el = item.find('a')
                v.author = text if author_el is None else author_el.text
            elif label == 'Rating:':
                fields = text.split()
                v.rating = float(fields[0])
                v.rating_max = float(fields[2])
            elif label == 'Date:':
                found = self.DATE_REGEXP.match(text)
                if found:
                    # Group 1 is looked up in MONTH2I and its position is the
                    # month number. NOTE(review): presumably MONTH2I has a
                    # placeholder at index 0 -- confirm against its definition.
                    month = self.MONTH2I.index(found.group(1))
                    day, hour, minute, second, year = [int(found.group(i)) for i in range(2, 7)]
                    v.date = datetime.datetime(year, month, day, hour, minute, second)
Ejemplo n.º 8
0
 def get_title(self):
     """
     Return the title declared in the page's ``<meta name="title">`` element.

     :return: the stripped ``content`` attribute, passed through ``to_unicode``
     """
     meta = select(self.document.getroot(), "meta[name=title]", 1)
     raw_title = meta.attrib["content"]
     return to_unicode(raw_title.strip())
Ejemplo n.º 9
0
 def get_author(self):
     """
     Return the author name shown on the page.

     :return: stripped text of ``a.watch-description-username strong``
     """
     name_node = select(self.document.getroot(), "a.watch-description-username strong", 1)
     author = name_node.text
     return author.strip()
Ejemplo n.º 10
0
 def get_video(self, video=None):
     """
     Report that the video cannot be retrieved from this page.

     :param video: not used by this method
     :raises ForbiddenVideo: always, with the stripped text of the
         ``.yt-alert-content`` element as message
     """
     alert = select(self.document.getroot(), ".yt-alert-content", 1)
     message = alert.text.strip()
     raise ForbiddenVideo(message)
Ejemplo n.º 11
0
 def get_source(self):
     """
     Return the text of the ``textarea#content_text`` element.

     :return: the textarea's text, not stripped
     """
     textarea = select(self.document.getroot(), 'textarea#content_text', 1)
     return textarea.text
Ejemplo n.º 12
0
 def get_title(self):
     """
     Return the title found in the ``#videoArea h1`` heading.

     The title is the text that follows the heading's first child element
     (that child's tail).

     :return: the stripped title as unicode, or an empty string when no text
         follows the first child
     :raises IndexError: if the heading has no child element
     """
     heading = select(self.document.getroot(), '#videoArea h1', 1)
     tail = heading[0].tail
     if tail is None:
         # unicode(None) would produce the literal text u'None'.
         return u''
     return unicode(tail).strip()
Ejemplo n.º 13
0
 def get_url(self):
     """
     Return the target of the link found inside the ``#download`` element.

     :return: value of the link's ``href`` attribute
     """
     container = select(self.document.getroot(), '#download', 1)
     link = select(container, 'a', 1)
     href = link.attrib['href']
     return href
Ejemplo n.º 14
0
 def get_nb_remaining_free_sms(self):
     """
     Return the number of free SMS announced by the ``#smsReminder`` element.

     The value is whatever text stands in place of the ``nb`` group of the
     reminder sentence; it is returned as text, not converted to a number.

     :return: the captured ``nb`` text
     """
     reminder_pattern = u'Il vous reste (?P<nb>.+) Texto gratuits vers les numéros SFR à envoyer aujourd\'hui'
     reminder = select(self.document.getroot(), '#smsReminder', 1).text.strip()
     found = re.match(reminder_pattern, reminder)
     captured = found.groupdict()
     return captured.get('nb')