Code Example #1
File: dayt_scraper.py  Project: uguer30/Project
    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        sources = []
        if not source_url or source_url == FORCE_NO_MATCH: return hosters
        page_url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=1)
        iframes = dom_parser2.parse_dom(html, 'iframe', req='src')
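        # Walk the iframes on the page; nested iframes found inside each one are
        # appended to the list being iterated, so they get visited as well. A
        # docs.google.com iframe is treated as the final hop and handed to parse_google().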
        for attrs, _content in iframes:
            iframe_url = attrs['src']
            if 'docs.google.com' in iframe_url:
                sources = scraper_utils.parse_google(self, iframe_url)
                break
            else:
                iframe_url = scraper_utils.urljoin(self.base_url, iframe_url)
                html = self._http_get(iframe_url, cache_limit=1)
                iframes += dom_parser2.parse_dom(html, 'iframe', req='src')

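        # parse_google() is assumed to return direct Google Video stream URLs,
        # so every collected source is wrapped as a direct hoster entry.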
        for source in sources:
            host = scraper_utils.get_direct_hostname(self, source)
            hoster = {
                'multi-part': False,
                'host': host,
                'class': self,
                'quality': scraper_utils.gv_get_quality(source),
                'views': None,
                'rating': None,
                'url': source,
                'direct': True
            }
            hosters.append(hoster)

        return hosters
Code Example #2
File: pubfilm_scraper.py  Project: uguer30/Project
 def __get_gk_links(self, link, iframe_url):
     sources = {}
     data = {'link': link}
     headers = dict(XHR)  # copy; don't mutate the shared XHR headers constant
     headers.update({'Referer': iframe_url, 'User-Agent': USER_AGENT})
     html = self._http_get(GK_URL, data=data, headers=headers, cache_limit=.25)
     js_data = scraper_utils.parse_json(html, GK_URL)
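     # js_data['link'] can be either a single URL string or a list of link dicts,
     # each optionally carrying a 'label' with the resolution.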
     if 'link' in js_data:
         if isinstance(js_data['link'], basestring):
             stream_url = js_data['link']
             if scraper_utils.get_direct_hostname(self, stream_url) == 'gvideo':
                 for source in scraper_utils.parse_google(self, stream_url):
                     sources[source] = {'quality': scraper_utils.gv_get_quality(source), 'direct': True}
             else:
                 sources[stream_url] = {'quality': QUALITIES.HIGH, 'direct': False}
         else:
             for link in js_data['link']:
                 stream_url = link['link']
                 if scraper_utils.get_direct_hostname(self, stream_url) == 'gvideo':
                     quality = scraper_utils.gv_get_quality(stream_url)
                 elif 'label' in link:
                     quality = scraper_utils.height_get_quality(link['label'])
                 else:
                     quality = QUALITIES.HIGH
                 sources[stream_url] = {'quality': quality, 'direct': True}
     return sources
Code Example #3
File: mehliz_scraper.py  Project: uguer30/Project
    def get_sources(self, video):
        hosters = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH: return hosters
        page_url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=8)
        fragment = dom_parser2.parse_dom(html, 'div', {'class': 'playex'})
        if fragment: html = fragment[0].content
        iframe_url = dom_parser2.parse_dom(html, 'iframe', req='src')
        if not iframe_url: return hosters
        iframe_url = iframe_url[0].attrs['src']
        if iframe_url.startswith('/'):
            iframe_url = scraper_utils.urljoin(self.base_url, iframe_url)
        html = self._http_get(iframe_url,
                              headers={'Referer': page_url},
                              cache_limit=.5)
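        # If the embedded player exposes an <object data="..."> element, its data URL
        # is presumed to point at Google Video and is expanded via parse_google();
        # otherwise fall back to the generic sources-list parser.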
        obj = dom_parser2.parse_dom(html, 'object', req='data')
        if obj:
            streams = dict((stream_url, {
                'quality': scraper_utils.gv_get_quality(stream_url),
                'direct': True
            }) for stream_url in scraper_utils.parse_google(
                self, obj[0].attrs['data']))
        else:
            streams = scraper_utils.parse_sources_list(self, html)

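        # Non-gvideo streams keep the quality reported by the page and get
        # User-Agent/Referer headers appended to the URL for playback.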
        for stream_url, values in streams.iteritems():
            host = scraper_utils.get_direct_hostname(self, stream_url)
            if host == 'gvideo':
                quality = scraper_utils.gv_get_quality(stream_url)
            else:
                quality = values['quality']
                stream_url += scraper_utils.append_headers({
                    'User-Agent':
                    scraper_utils.get_ua(),
                    'Referer':
                    page_url
                })

            source = {
                'multi-part': False,
                'url': stream_url,
                'host': host,
                'class': self,
                'quality': quality,
                'views': None,
                'rating': None,
                'direct': True
            }
            hosters.append(source)

        return hosters
Code Example #4
File: movytvy_scraper.py  Project: uguer30/Project
    def get_sources(self, video):
        hosters = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH: return hosters
        page_url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, require_debrid=True, cache_limit=.5)
        fragment = dom_parser2.parse_dom(html, 'table',
                                         {'class': 'links-table'})
        if not fragment: return hosters
        for _attrs, row in dom_parser2.parse_dom(fragment[0].content, 'tr'):
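            # The first capture group is the stream URL passed to playVideo.bind(),
            # the second is the release name from the following table cell.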
            match = re.search(
                r"playVideo\.bind\(.*?'([^']+)(?:[^>]*>){2}(.*?)</td>", row,
                re.DOTALL)
            if not match: continue

            stream_url, release = match.groups()
            if scraper_utils.get_direct_hostname(self, stream_url) == 'gvideo':
                sources = scraper_utils.parse_google(self, stream_url)
            else:
                sources = [stream_url]

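            # gvideo links are direct; for everything else the quality is derived
            # from the release name (movie vs. episode) and the hosting domain.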
            for source in sources:
                host = scraper_utils.get_direct_hostname(self, source)
                if host == 'gvideo':
                    quality = scraper_utils.gv_get_quality(source)
                    direct = True
                else:
                    host = urlparse.urlparse(source).hostname
                    if video.video_type == VIDEO_TYPES.MOVIE:
                        meta = scraper_utils.parse_movie_link(release)
                    else:
                        meta = scraper_utils.parse_episode_link(release)
                    base_quality = scraper_utils.height_get_quality(
                        meta['height'])
                    quality = scraper_utils.get_quality(
                        video, host, base_quality)
                    direct = False
                hoster = {
                    'multi-part': False,
                    'host': host,
                    'class': self,
                    'quality': quality,
                    'views': None,
                    'rating': None,
                    'url': source,
                    'direct': direct
                }
                hosters.append(hoster)

        return hosters
Code Example #5
    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if not source_url or source_url == FORCE_NO_MATCH: return hosters
        js_url = scraper_utils.urljoin(self.base_url, '/javascript/movies.js')
        html = self._http_get(js_url, cache_limit=48)
        if source_url.startswith('/'):
            source_url = source_url[1:]
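        # movies.js appears to hold every title's player setup; the regex picks out
        # the play('...') URL attached to this source's element id.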
        pattern = r'''getElementById\(\s*"%s".*?play\(\s*'([^']+)''' % (
            source_url)
        match = re.search(pattern, html, re.I)
        if match:
            stream_url = match.group(1)
            if 'drive.google' in stream_url or 'docs.google' in stream_url:
                sources = scraper_utils.parse_google(self, stream_url)
            else:
                sources = [stream_url]

            for source in sources:
                stream_url = source + scraper_utils.append_headers(
                    {'User-Agent': scraper_utils.get_ua()})
                host = scraper_utils.get_direct_hostname(self, source)
                if host == 'gvideo':
                    quality = scraper_utils.gv_get_quality(source)
                    direct = True
                elif 'youtube' in stream_url:
                    quality = QUALITIES.HD720
                    direct = False
                    host = 'youtube.com'
                else:
                    quality = QUALITIES.HIGH
                    direct = True
                hoster = {
                    'multi-part': False,
                    'host': host,
                    'class': self,
                    'quality': quality,
                    'views': None,
                    'rating': None,
                    'url': stream_url,
                    'direct': direct
                }
                hosters.append(hoster)
        return hosters
Code Example #6
    def get_sources(self, video):
        source_url = self.get_url(video)
        sources = {}
        hosters = []
        if not source_url or source_url == FORCE_NO_MATCH: return hosters
        url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._http_get(url, cache_limit=0)
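        # Each embeds[N] = '...' assignment in the page's JavaScript holds an
        # iframe tag; pull its src attribute out of the string.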
        for match in re.finditer(r"embeds\[(\d+)\]\s*=\s*'([^']+)", html):
            match = re.search('src="([^"]+)', match.group(2))
            if match:
                iframe_url = match.group(1)
                if 'play-en.php' in iframe_url:
                    match = re.search('id=([^"&]+)', iframe_url)
                    if match:
                        proxy_link = match.group(1)
                        proxy_link = proxy_link.split('*', 1)[-1]
                        picasa_url = scraper_utils.gk_decrypt(
                            self.get_name(), GK_KEY, proxy_link)
                        for stream_url in scraper_utils.parse_google(
                                self, picasa_url):
                            sources[stream_url] = {
                                'quality':
                                scraper_utils.gv_get_quality(stream_url),
                                'direct': True
                            }
                else:
                    html = self._http_get(iframe_url, cache_limit=0)
                    temp_sources = scraper_utils.parse_sources_list(self, html)
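                    # download.php links are resolved with a redirect-less HEAD request;
                    # if that returns a URL, it replaces the original entry.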
                    for source in list(temp_sources):  # snapshot; entries are replaced below
                        if 'download.php' in source:
                            redir_html = self._http_get(source,
                                                        allow_redirect=False,
                                                        method='HEAD',
                                                        cache_limit=0)
                            if redir_html.startswith('http'):
                                temp_sources[redir_html] = temp_sources[source]
                                del temp_sources[source]
                    sources.update(temp_sources)
                    for source in dom_parser2.parse_dom(html,
                                                        'source',
                                                        {'type': 'video/mp4'},
                                                        req='src'):
                        sources[source.attrs['src']] = {
                            'quality': QUALITIES.HD720,
                            'direct': True,
                            'referer': iframe_url
                        }

        for source, values in sources.iteritems():
            host = scraper_utils.get_direct_hostname(self, source)
            headers = {'User-Agent': scraper_utils.get_ua()}
            if 'referer' in values: headers['Referer'] = values['referer']
            stream_url = source + scraper_utils.append_headers(headers)
            if host == 'gvideo':
                quality = scraper_utils.gv_get_quality(source)
            else:
                quality = values['quality']
                if quality not in Q_ORDER:
                    quality = QUALITY_MAP.get(values['quality'],
                                              QUALITIES.HIGH)

            hoster = {
                'multi-part': False,
                'url': stream_url,
                'host': host,
                'class': self,
                'quality': quality,
                'views': None,
                'rating': None,
                'direct': True
            }
            hosters.append(hoster)

        return hosters