Example #1
class DirectDownload_Scraper(scraper.Scraper):
    base_url = BASE_URL

    def __init__(self, timeout=scraper.DEFAULT_TIMEOUT):
        self.timeout = timeout
        self.db_connection = DB_Connection()
        self.base_url = xbmcaddon.Addon().getSetting('%s-base_url' % (self.get_name()))
        self.username = xbmcaddon.Addon().getSetting('%s-username' % (self.get_name()))
        self.password = xbmcaddon.Addon().getSetting('%s-password' % (self.get_name()))

    @classmethod
    def provides(cls):
        return frozenset([VIDEO_TYPES.EPISODE])

    @classmethod
    def get_name(cls):
        return 'DirectDownload.tv'

    def resolve_link(self, link):
        return link

    def format_source_label(self, item):
        return '[%s] (%s) %s' % (item['quality'], item['dd_qual'], item['host'])

    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if source_url:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)
            if html:
                js_result = json.loads(html)
                query = urlparse.parse_qs(urlparse.urlparse(url).query)
                match_quality = Q_ORDER
                if 'quality' in query:
                    temp_quality = re.sub('\s', '', query['quality'][0])
                    match_quality = temp_quality.split(',')
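                    # e.g. a query like '?quality=HD,HIGH' yields ['HD', 'HIGH'] (values illustrative)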

                import urlresolver
                sxe_str = '.S%02dE%02d.' % (int(video.season), int(video.episode))
                airdate_str = video.ep_airdate.strftime('.%Y.%m.%d.')
                for result in js_result:
                    if sxe_str not in result['release'] and airdate_str not in result['release']:
                        continue
                    
                    if result['quality'] in match_quality:
                        for link in result['links']:
                            if re.search('\.rar(\.|$)', link['url']):
                                continue
                            
                            # validate the whole url since host-only validation fails for real-debrid; mark links direct to skip the unusable-host check
                            if urlresolver.HostedMediaFile(link['url']):
                                hostname = urlparse.urlparse(link['url']).hostname
                                hoster = {'multi-part': False, 'class': self, 'views': None, 'url': link['url'], 'rating': None, 'host': hostname,
                                        'quality': QUALITY_MAP[result['quality']], 'dd_qual': result['quality'], 'direct': True}
                                hosters.append(hoster)

        return hosters

    def get_url(self, video):
        url = None
        result = self.db_connection.get_related_url(video.video_type, video.title, video.year, self.get_name(), video.season, video.episode)
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|' % (video.video_type, video.title, video.year, self.get_name(), url))
        else:
            date_match = False
            search_title = '%s S%02dE%02d' % (video.title, int(video.season), int(video.episode))
            results = self.search(video.video_type, search_title, '')
            if not results and video.ep_airdate is not None:
                search_title = '%s %s' % (video.title, video.ep_airdate.strftime('%Y.%m.%d'))
                results = self.search(video.video_type, search_title, '')
                date_match = True

            best_q_index = -1
            for result in results:
                if date_match and video.ep_airdate.strftime('%Y.%m.%d') not in result['title']:
                    continue
                
                if Q_DICT[result['quality']] > best_q_index:
                    best_q_index = Q_DICT[result['quality']]
                    url = result['url']
            self.db_connection.set_related_url(video.video_type, video.title, video.year, self.get_name(), url)
        return url

    @classmethod
    def get_settings(cls):
        settings = super(DirectDownload_Scraper, cls).get_settings()
        settings = cls._disable_sub_check(settings)
        name = cls.get_name()
        settings.append('         <setting id="%s-username" type="text" label="     Username" default="" visible="eq(-6,true)"/>' % (name))
        settings.append('         <setting id="%s-password" type="text" label="     Password" option="hidden" default="" visible="eq(-7,true)"/>' % (name))
        return settings

    def search(self, video_type, title, year):
        search_url = urlparse.urljoin(self.base_url, '/search?query=')
        search_url += title
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        if html:
            js_result = json.loads(html)
            for match in js_result:
                url = search_url + '&quality=%s' % match['quality']
                result = {'url': url.replace(self.base_url, ''), 'title': match['release'], 'quality': match['quality'], 'year': ''}
                results.append(result)
        return results

    def _http_get(self, url, data=None, cache_limit=8):
        # return a blank (uncached) page when no username or password is configured
        if not self.username or not self.password:
            return ''

        if 'search?query' in url:
            log_utils.log('Translating Search Url: %s' % (url), xbmc.LOGDEBUG)
            url = self.__translate_search(url)

        html = super(DirectDownload_Scraper, self)._cached_http_get(url, self.base_url, self.timeout, data=data, cache_limit=cache_limit)

        fake = None
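        # (fake stays None if the response is not JSON, becomes False if it is JSON
        #  without a 'fake' flag on the first result, else takes the flag's value)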
        try:
            js_result = json.loads(html)
            fake = False
            fake = js_result[0]['fake']
        except: pass

        if fake or (fake is None and not re.search(LOGOUT, html)):
            log_utils.log('Logging in for url (%s)' % (url), xbmc.LOGDEBUG)
            self.__login()
            html = super(DirectDownload_Scraper, self)._cached_http_get(url, self.base_url, self.timeout, data=data, cache_limit=0)

        return html

    def __translate_search(self, url):
        query = urlparse.parse_qs(urlparse.urlparse(url).query)
        quality = re.sub('\s', '', query['quality'][0]) if 'quality' in query else ','.join(Q_ORDER)
        return urlparse.urljoin(self.base_url, (SEARCH_URL % (urllib.quote(query['query'][0]), quality)))

    def __login(self):
        url = self.base_url
        data = {'username': self.username, 'password': self.password, 'Login': '******'}
        html = super(DirectDownload_Scraper, self)._cached_http_get(url, self.base_url, self.timeout, data=data, cache_limit=0)
        if not re.search(LOGOUT, html):
            raise Exception('directdownload.tv login failed')
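
A minimal usage sketch of the scraper interface above; 'video' is assumed to be a ScraperVideo-like object as described in the Scraper base class in Example #2, and the concrete values are illustrative rather than part of the original code:

dd = DirectDownload_Scraper(timeout=30)
if VIDEO_TYPES.EPISODE in dd.provides():
    rel_url = dd.get_url(video)                 # site-relative url for the episode, cached in the DB
    for source in dd.get_sources(video):        # candidate hoster links
        label = dd.format_source_label(source)  # e.g. '[HD] (720P) somehost.com'
        stream_url = dd.resolve_link(source['url'])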
Example #2
class Scraper(object):
    __metaclass__ = abc.ABCMeta
    base_url = BASE_URL

    def __init__(self, timeout=DEFAULT_TIMEOUT):
        self.db_connection = DB_Connection()

    @abstractclassmethod
    def provides(cls):
        """
        Must return a list/set/frozenset of the VIDEO_TYPES supported by this scraper. It is a class method so that
        the scraper does not have to be instantiated to determine whether it is useful.

        * A set or frozenset is preferred because existence checking is faster with sets
        """
        raise NotImplementedError

    @abstractclassmethod
    def get_name(cls):
        """
        Must return a string with the name used throughout the UI and DB to refer to urls from this source.
        The name should be descriptive enough to be recognized but short enough to present in the UI.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def resolve_link(self, link):
        """
        Must return a string that is a urlresolver-resolvable link, given a link that this scraper supports

        link: a url fragment associated with this site that can be resolved to a hoster link

        * The purpose is that many streaming sites provide the actual hoster link on a page separate
        from the video page.
        * This method is called on the user-selected source before urlresolver is called on it.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def format_source_label(self, item):
        """
        Must return the string to be used as the label for this source in the "Choose Source" dialog
        
        item: one element of the list that is returned from get_sources for this scraper
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_sources(self, video):
        """
        Must return a list of dictionaries that are potential links to hoster sites (or links to links to hoster sites)
        Each dictionary must contain at least the following elements:
            * multi-part: True if this source is one part of a whole
            * class: a reference to an instance of the scraper itself
            * host: the hostname of the hoster
            * url: a url that is a link to a hoster, or a link to a page that this scraper can resolve to a link to a hoster
            * quality: one of the QUALITIES values, or None if unknown; users can sort sources by quality
            * views: the count of views the site reports for this source, or None if unknown; users can sort sources by views
            * rating: a value between 0 and 100 (0 being worst, 100 the best), or None if unknown; users can sort sources by rating
            * direct: True if url is a direct link to a media file, False if not; if not present, the link is assumed to be direct
            * other keys are allowed as needed if they would be useful (e.g. for format_source_label)
        
        video is an object of type ScraperVideo:
            video_type: one of VIDEO_TYPES for whatever the sources should be for
            title: the title of the tv show or movie
            year: the year of the tv show or movie
            season: only present for tv shows; the season number of the video for which sources are requested
            episode: only present for tv shows; the episode number of the video for which sources are requested
            ep_title: only present for tv shows; the episode title if available        
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_url(self, video):
        """
        Must return a url for the site this scraper is associated with that is related to this video.
        
        video is an object of type ScraperVideo:
            video_type: one of VIDEO_TYPES this url is for (e.g. EPISODE urls might be different than TVSHOW urls)
            title: the title of the tv show or movie
            year: the year of the tv show or movie
            season: only present for season or episode VIDEO_TYPES; the season number for the url being requested
            episode: only present for season or episode VIDEO_TYPES; the episode number for the url being requested
            ep_title: only present for tv shows; the episode title if available        
        
        * Generally speaking, domain should not be included
        """
        raise NotImplementedError

    def _default_get_url(self, video):
        temp_video_type = video.video_type
        if video.video_type == VIDEO_TYPES.EPISODE:
            temp_video_type = VIDEO_TYPES.TVSHOW
        url = None

        result = self.db_connection.get_related_url(temp_video_type,
                                                    video.title, video.year,
                                                    self.get_name())
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|' %
                          (temp_video_type, video.title, video.year,
                           self.get_name(), url))
        else:
            results = self.search(temp_video_type, video.title, video.year)
            if results:
                url = results[0]['url']
                self.db_connection.set_related_url(temp_video_type,
                                                   video.title, video.year,
                                                   self.get_name(), url)

        if url and video.video_type == VIDEO_TYPES.EPISODE:
            result = self.db_connection.get_related_url(
                VIDEO_TYPES.EPISODE, video.title, video.year, self.get_name(),
                video.season, video.episode)
            if result:
                url = result[0][0]
                log_utils.log('Got local related url: |%s|%s|%s|' %
                              (video, self.get_name(), url))
            else:
                show_url = url
                url = self._get_episode_url(show_url, video)
                if url:
                    self.db_connection.set_related_url(VIDEO_TYPES.EPISODE,
                                                       video.title, video.year,
                                                       self.get_name(), url,
                                                       video.season,
                                                       video.episode)

        return url

    @abc.abstractmethod
    def search(self, video_type, title, year):
        """
        Must return a list of results returned from the site associated with this scraper when doing a search using the input parameters
        
        If it does return results, it must be a list of dictionaries. Each dictionary must contain at least the following:
            * title: title of the result
            * year: year of the result
            * url: a url fragment that is the url on the site associated with this scraper for this search result item
        
        video_type: one of the VIDEO_TYPES being searched for. Only tvshows and movies are expected generally
        title: the title being searched for
        year: the year being searched for

        * The method must be provided, but it can raise NotImplementedError if search is not available on the site
        """
        raise NotImplementedError

    @classmethod
    def get_settings(cls):
        """
        Returns a list of settings to be used for this scraper. Settings are automatically checked for updates every time scrapers are imported.
        The list returned by each scraper is aggregated into one big settings.xml string; if that differs from the current settings.xml fragment
        in the Scrapers category, the existing fragment is removed and replaced by the new string.
        """
        name = cls.get_name()
        return [
            '         <setting id="%s-enable" type="bool" label="%s Enabled" default="true" visible="true"/>'
            % (name, name),
            '         <setting id="%s-base_url" type="text" label="     Base Url" default="%s" visible="eq(-1,true)"/>'
            % (name, cls.base_url),
            '         <setting id="%s-sub_check" type="bool" label="     Include in Page Existence checks?" default="true" visible="eq(-2,true)"/>'
            % (name),
            '         <setting id="%s_try" type="number" default="0" visible="false"/>'
            % (name),
            '         <setting id="%s_fail" type="number" default="0" visible="false"/>'
            % (name),
            '         <setting id="%s_check" type="number" default="0" visible="false"/>'
            % (name),
        ]

    @classmethod
    def _disable_sub_check(cls, settings):
        for i in reversed(xrange(len(settings))):
            if 'sub_check' in settings[i]:
                settings[i] = settings[i].replace('default="true"',
                                                  'default="false"')
        return settings

    def _cached_http_get(self,
                         url,
                         base_url,
                         timeout,
                         cookies=None,
                         data=None,
                         headers=None,
                         cache_limit=8):
        if cookies is None: cookies = {}
        if timeout == 0: timeout = None
        if headers is None: headers = {}
        referer = headers['Referer'] if 'Referer' in headers else url
        log_utils.log(
            'Getting Url: %s cookie=|%s| data=|%s| extra headers=|%s|' %
            (url, cookies, data, headers))
        db_connection = DB_Connection()
        _, html = db_connection.get_cached_url(url, cache_limit)
        if html:
            log_utils.log('Returning cached result for: %s' % (url),
                          xbmc.LOGDEBUG)
            return html

        try:
            cj = self._set_cookies(base_url, cookies)
            if data is not None: data = urllib.urlencode(data, True)
            request = urllib2.Request(url, data=data)
            request.add_header('User-Agent', USER_AGENT)
            request.add_unredirected_header('Host', request.get_host())
            request.add_unredirected_header('Referer', referer)
            for key in headers:
                request.add_header(key, headers[key])
            response = urllib2.urlopen(request, timeout=timeout)
            cj.save(ignore_discard=True, ignore_expires=True)
            if response.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(response.read())
                f = gzip.GzipFile(fileobj=buf)
                html = f.read()
            else:
                html = response.read()
        except Exception as e:
            log_utils.log(
                'Error (%s) during scraper http get: %s' % (str(e), url),
                xbmc.LOGWARNING)
            return ''

        db_connection.cache_url(url, html)
        return html

    def _set_cookies(self, base_url, cookies):
        domain = urlparse.urlsplit(base_url).hostname
        cookie_file = os.path.join(COOKIEPATH,
                                   '%s_cookies.lwp' % (self.get_name()))
        cj = cookielib.LWPCookieJar(cookie_file)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        for key in cookies:
            c = cookielib.Cookie(0,
                                 key,
                                 cookies[key],
                                 port=None,
                                 port_specified=False,
                                 domain=domain,
                                 domain_specified=True,
                                 domain_initial_dot=False,
                                 path='/',
                                 path_specified=True,
                                 secure=False,
                                 expires=None,
                                 discard=False,
                                 comment=None,
                                 comment_url=None,
                                 rest={})
            cj.set_cookie(c)
        try:
            cj.load(ignore_discard=True)
        except:
            pass
        return cj

    def _do_recaptcha(self, key, tries=None, max_tries=None):
        challenge_url = CAPTCHA_BASE_URL + '/challenge?k=%s' % (key)
        html = self._cached_http_get(challenge_url,
                                     CAPTCHA_BASE_URL,
                                     timeout=DEFAULT_TIMEOUT,
                                     cache_limit=0)
        match = re.search("challenge\s+\:\s+'([^']+)", html)
        captchaimg = 'http://www.google.com/recaptcha/api/image?c=%s' % (
            match.group(1))
        img = xbmcgui.ControlImage(450, 0, 400, 130, captchaimg)
        wdlg = xbmcgui.WindowDialog()
        wdlg.addControl(img)
        wdlg.show()
        header = 'Type the words in the image'
        if tries and max_tries:
            header += ' (Try: %s/%s)' % (tries, max_tries)
        kb = xbmc.Keyboard('', header, False)
        kb.doModal()
        solution = ''
        if kb.isConfirmed():
            solution = kb.getText()
            if not solution:
                raise Exception(
                    'You must enter text in the image to access video')
        wdlg.close()
        return {
            'recaptcha_challenge_field': match.group(1),
            'recaptcha_response_field': solution
        }
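    # A hedged usage note: a subclass would typically merge the returned captcha fields into its
    # form data before re-posting, e.g. (post_url and the form handling are site-specific and illustrative):
    #     data.update(self._do_recaptcha(key))
    #     html = self._cached_http_get(post_url, base_url, timeout, data=data, cache_limit=0)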

    def _default_get_episode_url(self,
                                 show_url,
                                 video,
                                 episode_pattern,
                                 title_pattern=''):
        log_utils.log(
            'Default Episode Url: |%s|%s|%s|' %
            (self.base_url, show_url, str(video).decode('utf-8', 'replace')),
            xbmc.LOGDEBUG)
        url = urlparse.urljoin(self.base_url, show_url)
        html = self._http_get(url, cache_limit=2)
        if html:
            force_title = self._force_title(video)

            if not force_title:
                match = re.search(episode_pattern, html, re.DOTALL)
                if match:
                    url = match.group(1)
                    return url.replace(self.base_url, '')
            else:
                log_utils.log(
                    'Skipping S&E matching as title search is forced on: %s' %
                    (video.slug), xbmc.LOGDEBUG)

            if (force_title or xbmcaddon.Addon().getSetting('title-fallback')
                    == 'true') and video.ep_title and title_pattern:
                norm_title = self._normalize_title(video.ep_title)
                for match in re.finditer(title_pattern, html,
                                         re.DOTALL | re.I):
                    url, title = match.groups()
                    if norm_title == self._normalize_title(title):
                        return url.replace(self.base_url, '')

    def _force_title(self, video):
        slug_str = xbmcaddon.Addon().getSetting('force_title_match')
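        # assumed format: a '|'-delimited list of show slugs, e.g. 'the-office|some-other-show' (slugs illustrative)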
        slug_list = slug_str.split('|') if slug_str else []
        return video.slug in slug_list

    def _normalize_title(self, title):
        new_title = title.upper()
        new_title = re.sub('\W', '', new_title)
        #log_utils.log('In title: |%s| Out title: |%s|' % (title,new_title), xbmc.LOGDEBUG)
        return new_title

    def _blog_get_quality(self, video, q_str, host):
        """
        Use q_str to determine the post quality, then use the host to determine the host quality;
        allow the host to lower the quality but not raise it
        """
        q_str = q_str.replace(video.title, '')  # str.replace returns a new string, so the result must be assigned
        q_str = q_str.replace(str(video.year), '')
        q_str = q_str.upper()

        # Assume movies are low quality, tv shows are high quality
        if video.video_type == VIDEO_TYPES.MOVIE:
            quality = QUALITIES.LOW
        else:
            quality = QUALITIES.HIGH

        post_quality = quality
        for key in Q_LIST:
            if any(q in q_str for q in BLOG_Q_MAP[key]):
                post_quality = key

        host_quality = None
        if host:
            for key in HOST_Q:
                if any(host in hostname for hostname in HOST_Q[key]):
                    host_quality = key

        #log_utils.log('q_str: %s, host: %s, post q: %s, host q: %s' % (q_str, host, post_quality, host_quality), xbmc.LOGDEBUG)
        if host_quality is not None and Q_ORDER[host_quality] < Q_ORDER[post_quality]:
            quality = host_quality
        else:
            quality = post_quality

        return quality

Example #3
class OneClickWatch_Scraper(scraper.Scraper):
    base_url = BASE_URL

    def __init__(self, timeout=scraper.DEFAULT_TIMEOUT):
        self.timeout = timeout
        self.db_connection = DB_Connection()
        self.base_url = xbmcaddon.Addon().getSetting('%s-base_url' % (self.get_name()))
    
    @classmethod
    def provides(cls):
        return frozenset([VIDEO_TYPES.MOVIE, VIDEO_TYPES.EPISODE])
    
    @classmethod
    def get_name(cls):
        return 'OneClickWatch'
    
    def resolve_link(self, link):
        return link

    def format_source_label(self, item):
        return '[%s] %s (%s/100)' % (item['quality'], item['host'], item['rating'])
    
    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if source_url:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)

            q_str = ''
            match = re.search('class="title">([^<]+)', html)
            if match:
                q_str = match.group(1)

            pattern = '^<a\s+href="([^"]+)"\s+rel="nofollow"'
            for match in re.finditer(pattern, html, re.M):
                url = match.group(1)
                hoster = {'multi-part': False, 'class': self, 'views': None, 'url': url, 'rating': None, 'direct': False}
                hoster['host'] = urlparse.urlsplit(url).hostname
                hoster['quality'] = self._blog_get_quality(video, q_str, hoster['host'])
                hosters.append(hoster)

        return hosters
    
    def get_url(self, video):
        url = None
        result = self.db_connection.get_related_url(video.video_type, video.title, video.year, self.get_name(), video.season, video.episode)
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|' % (video.video_type, video.title, video.year, self.get_name(), url))
        else:
            select = int(xbmcaddon.Addon().getSetting('%s-select' % (self.get_name())))
            if video.video_type == VIDEO_TYPES.EPISODE:
                search_title = '%s S%02dE%02d' % (video.title, int(video.season), int(video.episode))
            else:
                search_title = '%s %s' % (video.title, video.year)
            results = self.search(video.video_type, search_title, video.year)
            if results:
                if select == 0:
                    best_result = results[0]
                else:
                    best_result = results[0]  # fallback in case no result title carries a quality tag
                    best_qorder = 0
                    best_qstr = ''
                    for result in results:
                        match = re.search('\[(.*)\]$', result['title'])
                        if match:
                            q_str = match.group(1)
                            quality = self._blog_get_quality(video, q_str, '')
                            #print 'result: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
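                            # on a quality tie, prefer an HD result whose label mentions 1080 over one that does not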
                            if Q_ORDER[quality] >= best_qorder:
                                if Q_ORDER[quality] > best_qorder or (quality == QUALITIES.HD and '1080' in q_str and '1080' not in best_qstr):
                                    #print 'Setting best as: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                                    best_qstr = q_str
                                    best_result = result
                                    best_qorder = Q_ORDER[quality]
                            
                url = best_result['url']
                self.db_connection.set_related_url(video.video_type, video.title, video.year, self.get_name(), url)
        return url

    @classmethod
    def get_settings(cls):
        settings = super(OneClickWatch_Scraper, cls).get_settings()
        settings = cls._disable_sub_check(settings)
        name = cls.get_name()
        settings.append('         <setting id="%s-filter" type="slider" range="0,180" option="int" label="     Filter results older than (0=No Filter) (days)" default="30" visible="eq(-6,true)"/>' % (name))
        settings.append('         <setting id="%s-select" type="enum" label="     Automatically Select (Movies only)" values="Most Recent|Highest Quality" default="0" visible="eq(-7,true)"/>' % (name))
        return settings

    def search(self, video_type, title, year):
        search_url = urlparse.urljoin(self.base_url, '/?s=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        filter_days = datetime.timedelta(days=int(xbmcaddon.Addon().getSetting('%s-filter' % (self.get_name()))))
        today = datetime.date.today()
        pattern = 'class="title"><a href="([^"]+)[^>]+>([^<]+).*?rel="bookmark">([^<]+)'
        for match in re.finditer(pattern, html, re.DOTALL):
            url, title, date_str = match.groups('')
            if filter_days:
                try: post_date = datetime.datetime.strptime(date_str, '%B %d, %Y').date()
                except TypeError: post_date = datetime.datetime(*(time.strptime(date_str, '%B %d, %Y')[0:6])).date()
                if today - post_date > filter_days:
                    continue

            match_year = ''
            if video_type == VIDEO_TYPES.MOVIE:
                match = re.search('(.*?)\s*[\[(]?(\d{4})[)\]]?\s*(.*)', title)
                if match:
                    title, match_year, extra_title = match.groups()
                    title = '%s [%s]' % (title, extra_title)
            else:
                match = re.search('(.*?)\s*S\d+E\d+\s*(.*)', title)
                if match:
                    title, extra_title = match.groups()
                    title = '%s [%s]' % (title, extra_title)
                                
            if not year or not match_year or year == match_year:
                result = {'url': url.replace(self.base_url, ''), 'title': title, 'year': match_year}
                results.append(result)
        return results

    def _http_get(self, url, cache_limit=8):
        return super(OneClickWatch_Scraper, self)._cached_http_get(url, self.base_url, self.timeout, cache_limit=cache_limit)
Example #4
class OneClickWatch_Scraper(scraper.Scraper):
    base_url = BASE_URL

    def __init__(self, timeout=scraper.DEFAULT_TIMEOUT):
        self.timeout = timeout
        self.db_connection = DB_Connection()
        self.base_url = xbmcaddon.Addon().getSetting('%s-base_url' %
                                                     (self.get_name()))

    @classmethod
    def provides(cls):
        return frozenset([VIDEO_TYPES.MOVIE, VIDEO_TYPES.EPISODE])

    @classmethod
    def get_name(cls):
        return 'OneClickWatch'

    def resolve_link(self, link):
        return link

    def format_source_label(self, item):
        return '[%s] %s (%s/100)' % (item['quality'], item['host'],
                                     item['rating'])

    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if source_url:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)

            q_str = ''
            match = re.search('class="title">([^<]+)', html)
            if match:
                q_str = match.group(1)

            pattern = '^<a\s+href="([^"]+)"\s+rel="nofollow"'
            for match in re.finditer(pattern, html, re.M):
                url = match.group(1)
                hoster = {
                    'multi-part': False,
                    'class': self,
                    'views': None,
                    'url': url,
                    'rating': None,
                    'direct': False
                }
                hoster['host'] = urlparse.urlsplit(url).hostname
                hoster['quality'] = self._blog_get_quality(
                    video, q_str, hoster['host'])
                hosters.append(hoster)

        return hosters

    def get_url(self, video):
        url = None
        result = self.db_connection.get_related_url(video.video_type,
                                                    video.title, video.year,
                                                    self.get_name(),
                                                    video.season,
                                                    video.episode)
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|' %
                          (video.video_type, video.title, video.year,
                           self.get_name(), url))
        else:
            select = int(xbmcaddon.Addon().getSetting('%s-select' %
                                                      (self.get_name())))
            if video.video_type == VIDEO_TYPES.EPISODE:
                if not self._force_title(video):
                    search_title = '%s S%02dE%02d' % (
                        video.title, int(video.season), int(video.episode))
                else:
                    if not video.ep_title: return None
                    search_title = '%s %s' % (video.title, video.ep_title)
            else:
                search_title = '%s %s' % (video.title, video.year)
            results = self.search(video.video_type, search_title, video.year)
            if results:
                if select == 0:
                    best_result = results[0]
                else:
                    best_result = results[0]  # fallback in case no result title carries a quality tag
                    best_qorder = 0
                    best_qstr = ''
                    for result in results:
                        match = re.search('\[(.*)\]$', result['title'])
                        if match:
                            q_str = match.group(1)
                            quality = self._blog_get_quality(video, q_str, '')
                            #print 'result: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                            if Q_ORDER[quality] >= best_qorder:
                                if Q_ORDER[quality] > best_qorder or (
                                        quality == QUALITIES.HD
                                        and '1080' in q_str
                                        and '1080' not in best_qstr):
                                    #print 'Setting best as: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                                    best_qstr = q_str
                                    best_result = result
                                    best_qorder = Q_ORDER[quality]

                url = best_result['url']
                self.db_connection.set_related_url(video.video_type,
                                                   video.title, video.year,
                                                   self.get_name(), url)
        return url

    @classmethod
    def get_settings(cls):
        settings = super(OneClickWatch_Scraper, cls).get_settings()
        settings = cls._disable_sub_check(settings)
        name = cls.get_name()
        settings.append(
            '         <setting id="%s-filter" type="slider" range="0,180" option="int" label="     Filter results older than (0=No Filter) (days)" default="30" visible="eq(-6,true)"/>'
            % (name))
        settings.append(
            '         <setting id="%s-select" type="enum" label="     Automatically Select (Movies only)" values="Most Recent|Highest Quality" default="0" visible="eq(-7,true)"/>'
            % (name))
        return settings

    def search(self, video_type, title, year):
        search_url = urlparse.urljoin(self.base_url, '/?s=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        filter_days = datetime.timedelta(
            days=int(xbmcaddon.Addon().getSetting('%s-filter' %
                                                  (self.get_name()))))
        today = datetime.date.today()
        pattern = 'class="title"><a href="([^"]+)[^>]+>([^<]+).*?rel="bookmark">([^<]+)'
        for match in re.finditer(pattern, html, re.DOTALL):
            url, title, date_str = match.groups('')
            if filter_days:
                try:
                    post_date = datetime.datetime.strptime(
                        date_str, '%B %d, %Y').date()
                except TypeError:
                    post_date = datetime.datetime(
                        *(time.strptime(date_str, '%B %d, %Y')[0:6])).date()
                if today - post_date > filter_days:
                    continue

            match_year = ''
            if video_type == VIDEO_TYPES.MOVIE:
                match = re.search('(.*?)\s*[\[(]?(\d{4})[)\]]?\s*(.*)', title)
                if match:
                    title, match_year, extra_title = match.groups()
                    title = '%s [%s]' % (title, extra_title)
            else:
                match = re.search('(.*?)\s*S\d+E\d+\s*(.*)', title)
                if match:
                    title, extra_title = match.groups()
                    title = '%s [%s]' % (title, extra_title)

            if not year or not match_year or year == match_year:
                result = {
                    'url': url.replace(self.base_url, ''),
                    'title': title,
                    'year': match_year
                }
                results.append(result)
        return results

    def _http_get(self, url, cache_limit=8):
        return super(OneClickWatch_Scraper,
                     self)._cached_http_get(url,
                                            self.base_url,
                                            self.timeout,
                                            cache_limit=cache_limit)
Example #5
class MyVidLinks_Scraper(scraper.Scraper):
    base_url = BASE_URL

    def __init__(self, timeout=scraper.DEFAULT_TIMEOUT):
        self.timeout = timeout
        self.db_connection = DB_Connection()
        self.base_url = xbmcaddon.Addon().getSetting('%s-base_url' %
                                                     (self.get_name()))

    @classmethod
    def provides(cls):
        return frozenset([VIDEO_TYPES.MOVIE, VIDEO_TYPES.EPISODE])

    @classmethod
    def get_name(cls):
        return 'MyVideoLinks.eu'

    def resolve_link(self, link):
        return link

    def format_source_label(self, item):
        return '[%s] %s (%s Views) (%s/100)' % (item['quality'], item['host'],
                                                item['views'], item['rating'])

    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if source_url:
            self.__fix_base_url(video.video_type)
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)

            views = None
            pattern = '<span[^>]+>(\d+)\s+Views'
            match = re.search(pattern, html)
            if match:
                views = int(match.group(1))

            if video.video_type == VIDEO_TYPES.MOVIE:
                return self.__get_movie_links(video, views, html)
            else:
                return self.__get_episode_links(video, views, html)
        return hosters

    def __get_movie_links(self, video, views, html):
        pattern = 'rel="bookmark"\s+title="Permanent Link to ([^"]+)'
        match = re.search(pattern, html)
        q_str = ''
        if match:
            q_str = match.group(1)

        return self.__get_links(video, views, html, q_str)

    def __get_episode_links(self, video, views, html):
        pattern = '<h4>(.*?)</h4>(.*?)</ul>'
        hosters = []
        for match in re.finditer(pattern, html, re.DOTALL):
            q_str, fragment = match.groups()
            hosters += self.__get_links(video, views, fragment, q_str)
        return hosters

    def __get_links(self, video, views, html, q_str):
        pattern = 'li>\s*<a\s+href="(http[^"]+)'
        hosters = []
        for match in re.finditer(pattern, html):
            url = match.group(1)
            hoster = {
                'multi-part': False,
                'class': self,
                'views': views,
                'url': url,
                'rating': None,
                'quality': None,
                'direct': False
            }
            hoster['host'] = urlparse.urlsplit(url).hostname
            hoster['quality'] = self._blog_get_quality(video, q_str,
                                                       hoster['host'])
            hosters.append(hoster)
        return hosters

    def __fix_base_url(self, video_type):
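        # e.g. a base_url of 'http://myvideolinks.eu' (illustrative) becomes 'http://movies.myvideolinks.eu' for movies and 'http://tv.myvideolinks.eu' otherwise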
        if video_type == VIDEO_TYPES.MOVIE:
            if not self.base_url.startswith('http://movies.'):
                self.base_url = self.base_url.replace('http://',
                                                      'http://movies.')
        else:
            if not self.base_url.startswith('http://tv.'):
                self.base_url = self.base_url.replace('http://', 'http://tv.')

    def get_url(self, video):
        url = None
        result = self.db_connection.get_related_url(video.video_type,
                                                    video.title, video.year,
                                                    self.get_name(),
                                                    video.season,
                                                    video.episode)
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|' %
                          (video.video_type, video.title, video.year,
                           self.get_name(), url))
        else:
            select = int(xbmcaddon.Addon().getSetting('%s-select' %
                                                      (self.get_name())))
            if video.video_type == VIDEO_TYPES.EPISODE:
                if not self._force_title(video):
                    search_title = '%s S%02dE%02d' % (
                        video.title, int(video.season), int(video.episode))
                else:
                    if not video.ep_title: return None
                    search_title = '%s %s' % (video.title, video.ep_title)
            else:
                search_title = '%s %s' % (video.title, video.year)
            results = self.search(video.video_type, search_title, video.year)
            if results:
                # episodes don't tell us the quality on the search screen so just return the 1st result
                if select == 0 or video.video_type == VIDEO_TYPES.EPISODE:
                    best_result = results[0]
                else:
                    best_result = results[0]  # fallback in case no result title carries a quality tag
                    best_qorder = 0
                    best_qstr = ''
                    for result in results:
                        match = re.search('\[(.*)\]$', result['title'])
                        if match:
                            q_str = match.group(1)
                            quality = self._blog_get_quality(video, q_str, '')
                            #print 'result: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                            if Q_ORDER[quality] >= best_qorder:
                                if Q_ORDER[quality] > best_qorder or (
                                        quality == QUALITIES.HD
                                        and '1080' in q_str
                                        and '1080' not in best_qstr):
                                    #print 'Setting best as: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                                    best_qstr = q_str
                                    best_result = result
                                    best_qorder = Q_ORDER[quality]

                url = best_result['url']
                self.db_connection.set_related_url(video.video_type,
                                                   video.title, video.year,
                                                   self.get_name(), url)
        return url

    @classmethod
    def get_settings(cls):
        settings = super(MyVidLinks_Scraper, cls).get_settings()
        settings = cls._disable_sub_check(settings)
        name = cls.get_name()
        settings.append(
            '         <setting id="%s-filter" type="slider" range="0,180" option="int" label="     Filter results older than (0=No Filter) (days)" default="30" visible="eq(-6,true)"/>'
            % (name))
        settings.append(
            '         <setting id="%s-select" type="enum" label="     Automatically Select (Movies only)" values="Most Recent|Highest Quality" default="0" visible="eq(-7,true)"/>'
            % (name))
        return settings

    def search(self, video_type, title, year):
        self.__fix_base_url(video_type)
        search_url = urlparse.urljoin(self.base_url, '/?s=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        filter_days = datetime.timedelta(
            days=int(xbmcaddon.Addon().getSetting('%s-filter' %
                                                  (self.get_name()))))
        today = datetime.date.today()
        pattern = '<h4>\s*<a\s+href="([^"]+)"\s+rel="bookmark"\s+title="([^"]+)'
        for match in re.finditer(pattern, html, re.DOTALL):
            url, title = match.groups('')

            if filter_days:
                match = re.search('/(\d{4})/(\d{2})/(\d{2})/', url)
                if match:
                    # use distinct names so the 'year' parameter is not clobbered before the year match at the bottom of the loop
                    post_year, post_month, post_day = match.groups()
                    post_date = datetime.date(int(post_year), int(post_month), int(post_day))
                    if today - post_date > filter_days:
                        continue

            match_year = ''
            title = title.replace('&#8211;', '-')
            title = title.replace('&#8217;', "'")
            if video_type == VIDEO_TYPES.MOVIE:
                match = re.search('(.*?)\s*[\[(]?(\d{4})[)\]]?\s*(.*)', title)
                if match:
                    title, match_year, extra_title = match.groups()
                    title = '%s [%s]' % (title, extra_title)

            if not year or not match_year or year == match_year:
                result = {
                    'url': url.replace(self.base_url, ''),
                    'title': title,
                    'year': match_year
                }
                results.append(result)
        return results

    def _http_get(self, url, cache_limit=8):
        return super(MyVidLinks_Scraper,
                     self)._cached_http_get(url,
                                            self.base_url,
                                            self.timeout,
                                            cache_limit=cache_limit)
Example #6
class Scraper(object):
    __metaclass__ = abc.ABCMeta
    base_url = BASE_URL

    def __init__(self, timeout=DEFAULT_TIMEOUT):
        self.db_connection = DB_Connection()

    @abstractclassmethod
    def provides(cls):
        """
        Must return a list/set/frozenset of the VIDEO_TYPES supported by this scraper. It is a class method so that
        the scraper does not have to be instantiated to determine whether it is useful.

        * A set or frozenset is preferred because existence checking is faster with sets
        """
        raise NotImplementedError

    @abstractclassmethod
    def get_name(cls):
        """
        Must return a string with the name used throughout the UI and DB to refer to urls from this source.
        The name should be descriptive enough to be recognized but short enough to present in the UI.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def resolve_link(self, link):
        """
        Must return a string that is a urlresolver-resolvable link, given a link that this scraper supports

        link: a url fragment associated with this site that can be resolved to a hoster link

        * The purpose is that many streaming sites provide the actual hoster link on a page separate
        from the video page.
        * This method is called on the user-selected source before urlresolver is called on it.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def format_source_label(self, item):
        """
        Must return the string to be used as the label for this source in the "Choose Source" dialog

        item: one element of the list that is returned from get_sources for this scraper
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_sources(self, video):
        """
        Must return a list of dictionaries that are potential links to hoster sites (or links to links to hoster sites)
        Each dictionary must contain at least the following elements:
            * multi-part: True if this source is one part of a whole
            * class: a reference to an instance of the scraper itself
            * host: the hostname of the hoster
            * url: a url that is a link to a hoster, or a link to a page that this scraper can resolve to a link to a hoster
            * quality: one of the QUALITIES values, or None if unknown; users can sort sources by quality
            * views: the count of views the site reports for this source, or None if unknown; users can sort sources by views
            * rating: a value between 0 and 100 (0 being worst, 100 the best), or None if unknown; users can sort sources by rating
            * direct: True if url is a direct link to a media file, False if not; if not present, the link is assumed to be direct
            * other keys are allowed as needed if they would be useful (e.g. for format_source_label)

        video is an object of type ScraperVideo:
            video_type: one of VIDEO_TYPES for whatever the sources should be for
            title: the title of the tv show or movie
            year: the year of the tv show or movie
            season: only present for tv shows; the season number of the video for which sources are requested
            episode: only present for tv shows; the episode number of the video for which sources are requested
            ep_title: only present for tv shows; the episode title if available
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_url(self, video):
        """
        Must return a url for the site this scraper is associated with that is related to this video.

        video is an object of type ScraperVideo:
            video_type: one of VIDEO_TYPES this url is for (e.g. EPISODE urls might be different than TVSHOW urls)
            title: the title of the tv show or movie
            year: the year of the tv show or movie
            season: only present for season or episode VIDEO_TYPES; the season number for the url being requested
            episode: only present for season or episode VIDEO_TYPES; the episode number for the url being requested
            ep_title: only present for tv shows; the episode title if available

        * Generally speaking, domain should not be included
        """
        raise NotImplementedError

    @abc.abstractmethod
    def search(self, video_type, title, year):
        """
        Must return a list of results returned from the site associated with this scraper when doing a search using the input parameters

        If it does return results, it must be a list of dictionaries. Each dictionary must contain at least the following:
            * title: title of the result
            * year: year of the result
            * url: a url fragment that is the url on the site associated with this scraper for this search result item

        video_type: one of the VIDEO_TYPES being searched for. Only tvshows and movies are expected generally
        title: the title being searched for
        year: the year being searched for

        * The method must be provided, but it can raise NotImplementedError if search is not available on the site
        """
        raise NotImplementedError

    @classmethod
    def get_settings(cls):
        """
        Returns a list of settings to be used for this scraper. Settings are automatically checked for updates every time scrapers are imported.
        The list returned by each scraper is aggregated into one big settings.xml string; if that differs from the current settings.xml fragment
        in the Scrapers category, the existing fragment is removed and replaced by the new string.
        """
        name = cls.get_name()
        return ['         <setting id="%s-enable" type="bool" label="%s Enabled" default="true" visible="true"/>' % (name, name),
                    '         <setting id="%s-base_url" type="text" label="     Base Url" default="%s" visible="eq(-1,true)"/>' % (name, cls.base_url),
                    '         <setting id="%s-sub_check" type="bool" label="     Include in Page Existence checks?" default="true" visible="eq(-2,true)"/>' % (name),
                    '         <setting id="%s_try" type="number" default="0" visible="false"/>' % (name),
                    '         <setting id="%s_fail" type="number" default="0" visible="false"/>' % (name),
                    '         <setting id="%s_check" type="number" default="0" visible="false"/>' % (name), ]

    @classmethod
    def _disable_sub_check(cls, settings):
        for i in reversed(xrange(len(settings))):
            if 'sub_check' in settings[i]:
                settings[i] = settings[i].replace('default="true"', 'default="false"')
        return settings

    def _default_get_url(self, video):
        temp_video_type = video.video_type
        if video.video_type == VIDEO_TYPES.EPISODE: temp_video_type = VIDEO_TYPES.TVSHOW
        url = None

        result = self.db_connection.get_related_url(temp_video_type, video.title, video.year, self.get_name())
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|' % (temp_video_type, video.title, video.year, self.get_name(), url))
        else:
            results = self.search(temp_video_type, video.title, video.year)
            if results:
                url = results[0]['url']
                self.db_connection.set_related_url(temp_video_type, video.title, video.year, self.get_name(), url)

        if url and video.video_type == VIDEO_TYPES.EPISODE:
            result = self.db_connection.get_related_url(VIDEO_TYPES.EPISODE, video.title, video.year, self.get_name(), video.season, video.episode)
            if result:
                url = result[0][0]
                log_utils.log('Got local related url: |%s|%s|%s|' % (video, self.get_name(), url))
            else:
                show_url = url
                url = self._get_episode_url(show_url, video)
                if url:
                    self.db_connection.set_related_url(VIDEO_TYPES.EPISODE, video.title, video.year, self.get_name(), url, video.season, video.episode)

        return url

    def _cached_http_get(self, url, base_url, timeout, cookies=None, data=None, headers=None, cache_limit=8):
        if cookies is None: cookies = {}
        if timeout == 0: timeout = None
        if headers is None: headers = {}
        referer = headers['Referer'] if 'Referer' in headers else url
        log_utils.log('Getting Url: %s cookie=|%s| data=|%s| extra headers=|%s|' % (url, cookies, data, headers))
        db_connection = DB_Connection()
        _, html = db_connection.get_cached_url(url, cache_limit)
        if html:
            log_utils.log('Returning cached result for: %s' % (url), xbmc.LOGDEBUG)
            return html

        try:
            cj = self._set_cookies(base_url, cookies)
            if data is not None: data = urllib.urlencode(data, True)
            request = urllib2.Request(url, data=data)
            request.add_header('User-Agent', USER_AGENT)
            request.add_unredirected_header('Host', request.get_host())
            request.add_unredirected_header('Referer', referer)
            for key in headers: request.add_header(key, headers[key])
            response = urllib2.urlopen(request, timeout=timeout)
            cj.save(ignore_discard=True, ignore_expires=True)
            if response.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(response.read())
                f = gzip.GzipFile(fileobj=buf)
                html = f.read()
            else:
                html = response.read()
        except Exception as e:
            log_utils.log('Error (%s) during scraper http get: %s' % (str(e), url), xbmc.LOGWARNING)
            return ''

        db_connection.cache_url(url, html)
        return html

    def _set_cookies(self, base_url, cookies):
        domain = urlparse.urlsplit(base_url).hostname
        cookie_file = os.path.join(COOKIEPATH, '%s_cookies.lwp' % (self.get_name()))
        cj = cookielib.LWPCookieJar(cookie_file)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        for key in cookies:
            c = cookielib.Cookie(0, key, cookies[key], port=None, port_specified=False, domain=domain, domain_specified=True,
                                domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=False, comment=None,
                                comment_url=None, rest={})
            cj.set_cookie(c)
        try: cj.load(ignore_discard=True)
        except: pass
        return cj

    def _do_recaptcha(self, key, tries=None, max_tries=None):
        challenge_url = CAPTCHA_BASE_URL + '/challenge?k=%s' % (key)
        html = self._cached_http_get(challenge_url, CAPTCHA_BASE_URL, timeout=DEFAULT_TIMEOUT, cache_limit=0)
        match = re.search("challenge\s+\:\s+'([^']+)", html)
        captchaimg = 'http://www.google.com/recaptcha/api/image?c=%s' % (match.group(1))
        img = xbmcgui.ControlImage(450, 0, 400, 130, captchaimg)
        wdlg = xbmcgui.WindowDialog()
        wdlg.addControl(img)
        wdlg.show()
        header = 'Type the words in the image'
        if tries and max_tries:
            header += ' (Try: %s/%s)' % (tries, max_tries)
        kb = xbmc.Keyboard('', header, False)
        kb.doModal()
        solution = ''
        if kb.isConfirmed():
            solution = kb.getText()
            if not solution:
                raise Exception('You must enter text in the image to access video')
        wdlg.close()
        return {'recaptcha_challenge_field': match.group(1), 'recaptcha_response_field': solution}
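
    # Note: the returned keys are the classic reCAPTCHA (v1) form field
    # names; callers are expected to post this dict back with the form that
    # the captcha protects.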

    def _default_get_episode_url(self, show_url, video, episode_pattern, title_pattern='', airdate_pattern=''):
        log_utils.log('Default Episode Url: |%s|%s|%s|' % (self.base_url, show_url, str(video).decode('utf-8', 'replace')), xbmc.LOGDEBUG)
        url = urlparse.urljoin(self.base_url, show_url)
        html = self._http_get(url, cache_limit=2)
        if html:
            force_title = self._force_title(video)

            if not force_title:
                match = re.search(episode_pattern, html, re.DOTALL)
                if match:
                    url = match.group(1)
                    return url.replace(self.base_url, '')

                if xbmcaddon.Addon().getSetting('airdate-fallback') == 'true' and airdate_pattern and video.ep_airdate:
                    airdate_pattern = airdate_pattern.replace('{year}', str(video.ep_airdate.year))
                    airdate_pattern = airdate_pattern.replace('{month}', str(video.ep_airdate.month))
                    airdate_pattern = airdate_pattern.replace('{p_month}', '%02d' % (video.ep_airdate.month))
                    airdate_pattern = airdate_pattern.replace('{month_name}', MONTHS[video.ep_airdate.month - 1])
                    airdate_pattern = airdate_pattern.replace('{short_month}', SHORT_MONS[video.ep_airdate.month - 1])
                    airdate_pattern = airdate_pattern.replace('{day}', str(video.ep_airdate.day))
                    airdate_pattern = airdate_pattern.replace('{p_day}', '%02d' % (video.ep_airdate.day))
                    log_utils.log('Air Date Pattern: %s' % (airdate_pattern), xbmc.LOGDEBUG)

                    match = re.search(airdate_pattern, html, re.DOTALL | re.I)
                    if match:
                        url = match.group(1)
                        return url.replace(self.base_url, '')
            else:
                log_utils.log('Skipping S&E matching as title search is forced on: %s' % (video.slug), xbmc.LOGDEBUG)

            if (force_title or xbmcaddon.Addon().getSetting('title-fallback') == 'true') and video.ep_title and title_pattern:
                norm_title = self._normalize_title(video.ep_title)
                for match in re.finditer(title_pattern, html, re.DOTALL | re.I):
                    url, title = match.groups()
                    if norm_title == self._normalize_title(title):
                        return url.replace(self.base_url, '')
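
    # Hedged example of the pattern arguments (hypothetical site markup):
    #   episode_pattern = 'href="([^"]+)">Season\s+%s\s+Episode\s+%s<' % (video.season, video.episode)
    #   airdate_pattern = 'href="([^"]+)">{month_name}\s+{day},\s+{year}<'
    #   title_pattern = 'href="([^"]+)">([^<]+)</a>'
    # episode_pattern and airdate_pattern must capture the episode url in
    # group 1; title_pattern must capture (url, title); the {..} placeholders
    # in airdate_pattern are substituted above before matching.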

    def _force_title(self, video):
        slug_str = xbmcaddon.Addon().getSetting('force_title_match')
        slug_list = slug_str.split('|') if slug_str else []
        return video.slug in slug_list

    def _normalize_title(self, title):
        new_title = title.upper()
        new_title = re.sub('\W', '', new_title)
        # log_utils.log('In title: |%s| Out title: |%s|' % (title,new_title), xbmc.LOGDEBUG)
        return new_title
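
    # Illustrative behavior: uppercase, then strip all non-word characters,
    # so "Marvel's Agents of S.H.I.E.L.D." and "Marvels Agents of SHIELD"
    # (hypothetical inputs) both normalize to 'MARVELSAGENTSOFSHIELD' and
    # therefore compare equal.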

    def _blog_proc_results(self, html, post_pattern, date_format, video_type, title, year):
        results = []
        match = re.search('(.*?)\s*S\d+E\d+\s*', title)
        if match:
            show_title = match.group(1)
        else:
            match = re.search('(.*?)\s*\d{4}\.\d{2}\.\d{2}\s*', title)
            if match:
                show_title = match.group(1)
            else:
                show_title = title
        norm_title = self._normalize_title(show_title)

        filter_days = datetime.timedelta(days=int(xbmcaddon.Addon().getSetting('%s-filter' % (self.get_name()))))
        today = datetime.date.today()
        for match in re.finditer(post_pattern, html, re.DOTALL):
            post_data = match.groupdict()
            post_title = post_data['post_title']
            if 'quality' in post_data:
                post_title += '- [%s]' % (post_data['quality'])

            if filter_days:
                try: post_date = datetime.datetime.strptime(post_data['date'], date_format).date()
                except TypeError: post_date = datetime.datetime(*(time.strptime(post_data['date'], date_format)[0:6])).date()
                if today - post_date > filter_days:
                    continue

            match_year = ''
            match_title = ''
            post_title = post_title.replace('&#8211;', '-')
            post_title = post_title.replace('&#8217;', "'")
            full_title = post_title
            if video_type == VIDEO_TYPES.MOVIE:
                match = re.search('(.*?)\s*[\[(]?(\d{4})[)\]]?\s*(.*)', post_title)
                if match:
                    match_title, match_year, extra_title = match.groups()
                    full_title = '%s [%s]' % (match_title, extra_title)
            else:
                match = re.search('(.*?)\s*S\d+E\d+\s*(.*)', post_title)
                if match:
                    match_title, extra_title = match.groups()
                    full_title = '%s [%s]' % (match_title, extra_title)
                else:
                    match = re.search('(.*?)\s*\d{4}[ .]?\d{2}[ .]?\d{2}\s*(.*)', post_title)
                    if match:
                        match_title, extra_title = match.groups()
                        full_title = '%s [%s]' % (match_title, extra_title)

            match_norm_title = self._normalize_title(match_title)
            if (match_norm_title in norm_title or norm_title in match_norm_title) and (not year or not match_year or year == match_year):
                result = {'url': post_data['url'].replace(self.base_url, ''), 'title': full_title, 'year': match_year}
                results.append(result)
        return results
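
    # Hedged example: post_pattern must supply the named groups consumed
    # above ('url' and 'post_title', optionally 'quality' and 'date'), e.g.
    # for a hypothetical WordPress-style listing:
    #   post_pattern = 'href="(?P<url>[^"]+)"[^>]*>(?P<post_title>[^<]+)</a>\s*(?P<date>\d{4}-\d{2}-\d{2})'
    #   date_format = '%Y-%m-%d'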
    
    def _blog_get_url(self, video, delim='.'):
        url = None
        result = self.db_connection.get_related_url(video.video_type, video.title, video.year, self.get_name(), video.season, video.episode)
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|' % (video.video_type, video.title, video.year, self.get_name(), url))
        else:
            select = int(xbmcaddon.Addon().getSetting('%s-select' % (self.get_name())))
            if video.video_type == VIDEO_TYPES.EPISODE:
                temp_title = re.sub('[^A-Za-z0-9 ]', '', video.title)
                if not self._force_title(video):
                    search_title = '%s S%02dE%02d' % (temp_title, int(video.season), int(video.episode))
                    fallback_search = '%s %s' % (temp_title, video.ep_airdate.strftime('%Y{0}%m{0}%d'.format(delim)))
                else:
                    if not video.ep_title: return None
                    search_title = '%s %s' % (temp_title, video.ep_title)
                    fallback_search = ''
            else:
                search_title = '%s %s' % (video.title, video.year)
                fallback_search = ''

            results = self.search(video.video_type, search_title, video.year)
            if not results and fallback_search:
                results = self.search(video.video_type, fallback_search, video.year)
            if results:
                if select == 0:
                    best_result = results[0]
                else:
                    best_result = results[0]  # fall back to the first result if no quality tag is found
                    best_qorder = 0
                    best_qstr = ''
                    for result in results:
                        match = re.search('\[(.*)\]$', result['title'])
                        if match:
                            q_str = match.group(1)
                            quality = self._blog_get_quality(video, q_str, '')
                            # print 'result: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                            if Q_ORDER[quality] >= best_qorder:
                                if Q_ORDER[quality] > best_qorder or (quality == QUALITIES.HD and '1080' in q_str and '1080' not in best_qstr):
                                    # print 'Setting best as: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                                    best_qstr = q_str
                                    best_result = result
                                    best_qorder = Q_ORDER[quality]

                url = best_result['url']
                self.db_connection.set_related_url(video.video_type, video.title, video.year, self.get_name(), url)
        return url

    def _blog_get_quality(self, video, q_str, host):
        """
        Use the q_str to determine the post quality; then use the host to determine host quality
        allow the host to drop the quality but not increase it
        """
        q_str.replace(video.title, '')
        q_str.replace(str(video.year), '')
        q_str = q_str.upper()

        post_quality = None
        for key in Q_LIST:
            if any(q in q_str for q in BLOG_Q_MAP[key]):
                post_quality = key

        return self._get_quality(video, host, post_quality)

    def _get_quality(self, video, host, base_quality=None):
        # Assume movies are low quality, tv shows are high quality
        if base_quality is None:
            if video.video_type == VIDEO_TYPES.MOVIE:
                quality = QUALITIES.LOW
            else:
                quality = QUALITIES.HIGH
        else:
            quality = base_quality

        host_quality = None
        if host:
            hl = host.lower()
            for key in HOST_Q:
                if any(hostname in hl for hostname in HOST_Q[key]):
                    host_quality = key
                    break

        # log_utils.log('q_str: %s, host: %s, post q: %s, host q: %s' % (q_str, host, post_quality, host_quality), xbmc.LOGDEBUG)
        if host_quality is not None and Q_ORDER[host_quality] < Q_ORDER[quality]:
            quality = host_quality

        return quality
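
    # Worked example (hedged, assuming the usual Q_ORDER ranking): an episode
    # with no base_quality defaults to QUALITIES.HIGH; if the host appears in
    # HOST_Q under QUALITIES.LOW the result drops to LOW, but a host ranked
    # above HIGH never raises it - hosts can only cap quality.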

    def _width_get_quality(self, width):
        width = int(width)
        if width >= 1280:
            quality = QUALITIES.HD
        elif width > 640:
            quality = QUALITIES.HIGH
        elif width > 320:
            quality = QUALITIES.MEDIUM
        else:
            quality = QUALITIES.LOW
        return quality

    def _height_get_quality(self, height):
        height = int(height)
        if height > 480:
            quality = QUALITIES.HD
        elif height >= 400:
            quality = QUALITIES.HIGH
        elif height > 200:
            quality = QUALITIES.MEDIUM
        else:
            quality = QUALITIES.LOW
        return quality
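
    # e.g. (by the thresholds above): 1920x1080 -> HD via either helper;
    # 640x360 -> MEDIUM via either; anything 320 or less wide, or 200 or
    # less high, -> LOW.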
Example #7
0
class DirectDownload_Scraper(scraper.Scraper):
    base_url = BASE_URL

    def __init__(self, timeout=scraper.DEFAULT_TIMEOUT):
        self.timeout = timeout
        self.db_connection = DB_Connection()
        self.base_url = xbmcaddon.Addon().getSetting('%s-base_url' %
                                                     (self.get_name()))
        self.username = xbmcaddon.Addon().getSetting('%s-username' %
                                                     (self.get_name()))
        self.password = xbmcaddon.Addon().getSetting('%s-password' %
                                                     (self.get_name()))

    @classmethod
    def provides(cls):
        return frozenset([VIDEO_TYPES.EPISODE])

    @classmethod
    def get_name(cls):
        return 'DirectDownload.tv'

    def resolve_link(self, link):
        return link

    def format_source_label(self, item):
        return '[%s] (%s) %s' % (item['quality'], item['dd_qual'],
                                 item['host'])

    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if source_url:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)
            if html:
                js_result = json.loads(html)
                query = urlparse.parse_qs(urlparse.urlparse(url).query)
                match_quality = Q_ORDER
                if 'quality' in query:
                    temp_quality = re.sub('\s', '', query['quality'][0])
                    match_quality = temp_quality.split(',')

                import urlresolver
                for result in js_result:
                    if result['quality'] in match_quality:
                        for link in result['links']:
                            # validate url since host validation fails for real-debrid; mark links direct to avoid unusable check
                            if urlresolver.HostedMediaFile(link['url']):
                                hostname = urlparse.urlparse(
                                    link['url']).hostname
                                hoster = {
                                    'multi-part': False,
                                    'class': self,
                                    'views': None,
                                    'url': link['url'],
                                    'rating': None,
                                    'host': hostname,
                                    'quality': QUALITY_MAP[result['quality']],
                                    'dd_qual': result['quality'],
                                    'direct': True
                                }
                                hosters.append(hoster)

        return hosters

    def get_url(self, video):
        url = None
        result = self.db_connection.get_related_url(video.video_type,
                                                    video.title, video.year,
                                                    self.get_name(),
                                                    video.season,
                                                    video.episode)
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|' %
                          (video.video_type, video.title, video.year,
                           self.get_name(), url))
        else:
            search_title = '%s S%02dE%02d' % (video.title, int(video.season), int(video.episode))
            results = self.search(video.video_type, search_title, '')
            best_q_index = -1
            for result in results:
                if Q_DICT[result['quality']] > best_q_index:
                    best_q_index = Q_DICT[result['quality']]
                    url = result['url']
            self.db_connection.set_related_url(video.video_type,
                                               video.title, video.year,
                                               self.get_name(), url)
        return url

    @classmethod
    def get_settings(cls):
        settings = super(DirectDownload_Scraper, cls).get_settings()
        settings = cls._disable_sub_check(settings)
        name = cls.get_name()
        settings.append(
            '         <setting id="%s-username" type="text" label="     Username" default="" visible="eq(-6,true)"/>'
            % (name))
        settings.append(
            '         <setting id="%s-password" type="text" label="     Password" option="hidden" default="" visible="eq(-7,true)"/>'
            % (name))
        return settings

    def search(self, video_type, title, year):
        search_url = urlparse.urljoin(self.base_url, '/search?query=')
        search_url += title
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        if html:
            js_result = json.loads(html)
            for match in js_result:
                url = search_url + '&quality=%s' % match['quality']
                result = {
                    'url': url.replace(self.base_url, ''),
                    'title': match['release'],
                    'quality': match['quality'],
                    'year': ''
                }
                results.append(result)
        return results
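
    # Hedged sketch of the JSON shape this scraper assumes the site returns
    # (field names taken from the code above; values are made up):
    #   [{"release": "Show.Name.S01E02.720P.HDTV.x264", "quality": "720P",
    #     "links": [{"url": "http://somehost/file.mkv"}]}]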

    def _http_get(self, url, data=None, cache_limit=8):
        # return all uncached blank pages if no user or pass
        if not self.username or not self.password:
            return ''

        if 'search?query' in url:
            log_utils.log('Translating Search Url: %s' % (url), xbmc.LOGDEBUG)
            url = self.__translate_search(url)

        html = super(DirectDownload_Scraper,
                     self)._cached_http_get(url,
                                            self.base_url,
                                            self.timeout,
                                            data=data,
                                            cache_limit=cache_limit)

        fake = None
        try:
            js_result = json.loads(html)
            fake = False
            fake = js_result[0]['fake']
        except:
            pass

        if fake or (fake is None and not re.search(LOGOUT, html)):
            log_utils.log('Logging in for url (%s)' % (url), xbmc.LOGDEBUG)
            self.__login()
            html = super(DirectDownload_Scraper,
                         self)._cached_http_get(url,
                                                self.base_url,
                                                self.timeout,
                                                data=data,
                                                cache_limit=0)

        return html
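
    # The fake/LOGOUT check above: if the JSON marks the page as fake, or the
    # response isn't JSON and carries no logout link, the session is treated
    # as stale, so we log in and refetch with cache_limit=0.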

    def __translate_search(self, url):
        query = urlparse.parse_qs(urlparse.urlparse(url).query)
        quality = re.sub(
            '\s', '',
            query['quality'][0]) if 'quality' in query else ','.join(Q_ORDER)
        return urlparse.urljoin(self.base_url,
                                (SEARCH_URL %
                                 (urllib.quote(query['query'][0]), quality)))
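
    # Hedged example: '/search?query=Some+Show&quality=720P' becomes
    # SEARCH_URL % ('Some%20Show', '720P'); with no quality parameter every
    # quality in Q_ORDER is requested, comma-joined.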

    def __login(self):
        url = self.base_url
        data = {
            'username': self.username,
            'password': self.password,
            'Login': '******'
        }
        html = super(DirectDownload_Scraper,
                     self)._cached_http_get(url,
                                            self.base_url,
                                            self.timeout,
                                            data=data,
                                            cache_limit=0)
        if not re.search(LOGOUT, html):
            raise Exception('directdownload.tv login failed')

class MyVidLinks_Scraper(scraper.Scraper):
    base_url = BASE_URL

    def __init__(self, timeout=scraper.DEFAULT_TIMEOUT):
        self.timeout = timeout
        self.db_connection = DB_Connection()
        self.base_url = xbmcaddon.Addon().getSetting('%s-base_url' % (self.get_name()))
    
    @classmethod
    def provides(cls):
        return frozenset([VIDEO_TYPES.MOVIE, VIDEO_TYPES.EPISODE])
    
    @classmethod
    def get_name(cls):
        return 'MyVideoLinks.eu'
    
    def resolve_link(self, link):
        return link

    def format_source_label(self, item):
        return '[%s] %s (%s Views) (%s/100)' % (item['quality'], item['host'], item['views'], item['rating'])
    
    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if source_url:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)

            views = None
            pattern = '<span[^>]+>(\d+)\s+Views'
            match = re.search(pattern, html)
            if match:
                views = int(match.group(1))
            
            if video.video_type == VIDEO_TYPES.MOVIE:
                return self.__get_movie_links(video, views, html)
            else:
                return self.__get_episode_links(video, views, html)
        return hosters

    def __get_movie_links(self, video, views, html):
        pattern = 'rel="bookmark"\s+title="Permanent Link to ([^"]+)'
        match = re.search(pattern, html)
        q_str = ''
        if match:
            q_str = match.group(1)
            
        return self.__get_links(video, views, html, q_str)
    
    def __get_episode_links(self, video, views, html):
        pattern = '<h4>(.*?)</h4>(.*?)</ul>'
        hosters = []
        for match in re.finditer(pattern, html, re.DOTALL):
            q_str, fragment = match.groups()
            hosters += self.__get_links(video, views, fragment, q_str)
        return hosters
    
    def __get_links(self, video, views, html, q_str):
        pattern = 'li>\s*<a\s+href="(http[^"]+)'
        hosters = []
        for match in re.finditer(pattern, html):
            url = match.group(1)
            hoster = {'multi-part': False, 'class': self, 'views': views, 'url': url, 'rating': None, 'quality': None, 'direct': False}
            hoster['host'] = urlparse.urlsplit(url).hostname
            hoster['quality'] = self._blog_get_quality(video, q_str, hoster['host'])
            hosters.append(hoster)
        return hosters
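
    # Hedged example: for a fragment like '<li><a href="http://somehost.com/x">'
    # the pattern captures the link, urlparse.urlsplit() yields host
    # 'somehost.com', and _blog_get_quality() may cap the quality by host.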
    
    def get_url(self, video):
        url = None
        result = self.db_connection.get_related_url(video.video_type, video.title, video.year, self.get_name(), video.season, video.episode)
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|' % (video.video_type, video.title, video.year, self.get_name(), url))
        else:
            select = int(xbmcaddon.Addon().getSetting('%s-select' % (self.get_name())))
            if video.video_type == VIDEO_TYPES.EPISODE:
                search_title = '%s S%02dE%02d' % (video.title, int(video.season), int(video.episode))
            else:
                search_title = '%s %s' % (video.title, video.year)
            results = self.search(video.video_type, search_title, video.year)
            if results:
                # episodes don't tell us the quality on the search screen so just return the 1st result
                if select == 0 or video.video_type == VIDEO_TYPES.EPISODE:
                    best_result = results[0]
                else:
                    best_result = results[0]  # fall back to the first result if no quality tag is found
                    best_qorder = 0
                    best_qstr = ''
                    for result in results:
                        match = re.search('\[(.*)\]$', result['title'])
                        if match:
                            q_str = match.group(1)
                            quality = self._blog_get_quality(video, q_str, '')
                            # print 'result: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                            if Q_ORDER[quality] >= best_qorder:
                                if Q_ORDER[quality] > best_qorder or (quality == QUALITIES.HD and '1080' in q_str and '1080' not in best_qstr):
                                    # print 'Setting best as: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                                    best_qstr = q_str
                                    best_result = result
                                    best_qorder = Q_ORDER[quality]
                            
                url = best_result['url']
                self.db_connection.set_related_url(video.video_type, video.title, video.year, self.get_name(), url)
        return url

    @classmethod
    def get_settings(cls):
        settings = super(MyVidLinks_Scraper, cls).get_settings()
        settings = cls._disable_sub_check(settings)
        name = cls.get_name()
        settings.append('         <setting id="%s-filter" type="slider" range="0,180" option="int" label="     Filter results older than (0=No Filter) (days)" default="30" visible="eq(-6,true)"/>' % (name))
        settings.append('         <setting id="%s-select" type="enum" label="     Automatically Select (Movies only)" values="Most Recent|Highest Quality" default="0" visible="eq(-7,true)"/>' % (name))
        return settings

    def search(self, video_type, title, year):
        search_url = urlparse.urljoin(self.base_url, '/?s=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        filter_days = datetime.timedelta(days=int(xbmcaddon.Addon().getSetting('%s-filter' % (self.get_name()))))
        today = datetime.date.today()
        pattern = '<h4>\s*<a\s+href="([^"]+)"\s+rel="bookmark"\s+title="([^"]+)'
        for match in re.finditer(pattern, html, re.DOTALL):
            url, title = match.groups('')
            if filter_days:
                match = re.search('/(\d{4})/(\d{2})/(\d{2})/', url)
                if match:
                    year, month, day = match.groups()
                    post_date = datetime.date(int(year), int(month), int(day))
                    if today - post_date > filter_days:
                        continue
                
            match_year = ''
            title = title.replace('&#8211;', '-')
            if video_type == VIDEO_TYPES.MOVIE:
                match = re.search('(.*?)\s*[\[(]?(\d{4})[)\]]?\s*(.*)', title)
                if match:
                    title, match_year, extra_title = match.groups()
                    title = '%s [%s]' % (title, extra_title)

            if not year or not match_year or year == match_year:
                result = {'url': url.replace(self.base_url, ''), 'title': title, 'year': match_year}
                results.append(result)
        return results

    def _http_get(self, url, cache_limit=8):
        return super(MyVidLinks_Scraper, self)._cached_http_get(url, self.base_url, self.timeout, cache_limit=cache_limit)