Ejemplos de parse_dom en Python, ejemplos de salts_lib.dom_parser.parse_dom en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: pelispedia_scraper.py Proyecto: kevintone/tdbaddon

 def __tv_search(self, title, year):
     """Search the by-letter series index for shows matching *title*.

     Requests '/series/letra/<first character of title>/' and returns a list
     of {'url', 'title', 'year'} dicts for every listed entry whose
     normalized title contains the normalized query. An entry is also
     dropped when both *year* and the entry's year are known and differ.
     An empty *title* returns an empty list without issuing a request.
     """
     found = []
     if not title:
         return found

     wanted = scraper_utils.normalize_title(title)
     index_url = urlparse.urljoin(self.base_url, '/series/letra/%s/' % (title[0]))
     html = self._http_get(index_url, cache_limit=48)
     for entry in dom_parser.parse_dom(html, 'li', {'class': '[^"]*bpM12[^"]*'}):
         headings = dom_parser.parse_dom(entry, 'h2')
         details = dom_parser.parse_dom(entry, 'div', {'class': '[^"]*sectionDetail[^"]*'})
         links = dom_parser.parse_dom(entry, 'a', ret='href')
         if not (headings and links):
             continue

         # The show name is the text before the first <br>; when there is
         # no <br> the whole heading is used.
         heading = headings[0]
         br_split = re.search('(.*?)<br>', heading)
         show_title = br_split.group(1) if br_split else heading

         show_year = ''
         if details:
             year_hit = re.search('(\d{4})', details[0])
             if year_hit:
                 show_year = year_hit.group(1)

         if wanted not in scraper_utils.normalize_title(show_title):
             continue
         if not year or not show_year or year == show_year:
             found.append({'url': scraper_utils.pathify_url(links[0]), 'title': scraper_utils.cleanse_title(show_title), 'year': show_year})

     return found

Ejemplo n.º 2

0

Mostrar archivo

Archivo: beinmovie_scraper.py Proyecto: beljim/tknorris-beta-repo

    def get_sources(self, video):
        """Collect direct stream sources from the video's player pages.

        Every 'movie-player/...' link inside the first 'movie_langs_list'
        div is fetched. The <source src> found on a player page becomes a
        hoster, and so does each other player link in that page's
        'servers' list (quality looked up in QUALITY_MAP, defaulting to
        QUALITIES.HD720). Returns a list of hoster dicts, empty when the
        video has no usable URL.
        """
        hosters = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH:
            return hosters

        page_url = urlparse.urljoin(self.base_url, source_url)
        page_html = self._http_get(page_url, cache_limit=.5)
        lang_lists = dom_parser.parse_dom(page_html, 'div', {'class': '[^"]*movie_langs_list[^"]*'})
        if not lang_lists:
            return hosters

        for href in re.finditer('href="([^"]+)', lang_lists[0]):
            player_id = re.search('movie-player/(.*)', href.group(1))
            if not player_id:
                continue

            player_url = urlparse.urljoin(self.base_url, PLAYER_URL % (player_id.group(1)))
            player_html = self._http_get(player_url, cache_limit=.5)
            source_tag = re.search('<source\s+src="([^"]+)', player_html)
            if source_tag:
                stream_url = source_tag.group(1)
                hosters.append({'multi-part': False, 'url': stream_url, 'class': self, 'quality': self._gv_get_quality(stream_url), 'host': self._get_direct_hostname(stream_url), 'rating': None, 'views': None, 'direct': True})

            server_lists = dom_parser.parse_dom(player_html, 'ul', {'class': 'servers'})
            if not server_lists:
                continue

            for server in re.finditer('href="([^"]+).*?<span>(.*?)</span>', server_lists[0]):
                server_href, label = server.groups()
                server_id = re.search('movie-player/(.*)', server_href)
                if not server_id:
                    continue

                other_url = urlparse.urljoin(self.base_url, PLAYER_URL % (server_id.group(1)))
                # The page being parsed lists itself among its servers.
                if other_url == player_url:
                    continue
                hosters.append({'multi-part': False, 'url': other_url, 'class': self, 'quality': QUALITY_MAP.get(label, QUALITIES.HD720), 'host': self._get_direct_hostname(other_url), 'rating': None, 'views': None, 'direct': True})

        return hosters

Ejemplo n.º 3

0

Mostrar archivo

Archivo: uflix_scraper.py Proyecto: assli100/kodi-openelec

 def search(self, video_type, title, year):
     """Search the site for *title* and return the matching entries.

     Only the result section for *video_type* is read ('movies' for
     VIDEO_TYPES.MOVIE, 'series' for VIDEO_TYPES.TVSHOW); any other
     video type raises KeyError. Each <figcaption> supplies a
     "title (year)" string and a link.

     Returns a list of {'title', 'url', 'year'} dicts. An entry is dropped
     only when both *year* and the entry's year are known and differ.
     """
     search_url = urlparse.urljoin(self.base_url, '/index.php?menu=search&query=')
     search_url += urllib.quote_plus(title)
     html = self._http_get(search_url, cache_limit=.25)
     results = []
     sections = {VIDEO_TYPES.MOVIE: 'movies', VIDEO_TYPES.TVSHOW: 'series'}

     fragment = dom_parser.parse_dom(html, 'div', {'id': sections[video_type]})
     if not fragment:
         return results

     for item in dom_parser.parse_dom(fragment[0], 'figcaption'):
         match = re.search('title="([^"]+)[^>]+href="([^"]+)', item)
         if not match:
             continue

         match_title_year, url = match.groups()
         match = re.search(r'(.*?)\s+\(?(\d{4})\)?', match_title_year)
         if match:
             match_title, match_year = match.groups()
         else:
             match_title = match_title_year
             match_year = ''

         # Remove only the leading/trailing decoration. str.replace() would
         # also delete the same words wherever they occur inside the real
         # title (e.g. "Watch The Night Watch Online").
         if match_title.startswith('Watch '):
             match_title = match_title[len('Watch '):]
         if match_title.endswith(' Online'):
             match_title = match_title[:-len(' Online')]

         if not year or not match_year or year == match_year:
             result = {'title': match_title, 'url': scraper_utils.pathify_url(url), 'year': match_year}
             results.append(result)
     return results

Ejemplo n.º 4

0

Mostrar archivo

Archivo: pctf_scraper.py Proyecto: rickardrocks/tknorris-beta-repo

    def search(self, video_type, title, year):
        """Query the site for *title* and return the matching entries.

        Each 'movie-info' div contributes one {'title', 'year', 'url'} dict
        when it has a 'movie-title' span and an href. Entries are dropped
        only when both *year* and the entry's year are known and differ.
        The base URL is removed from every returned link.
        """
        query_url = urlparse.urljoin(self.base_url, '/?query=') + urllib.quote_plus(title)
        html = self._http_get(query_url, cache_limit=.25)
        results = []
        for block in dom_parser.parse_dom(html, 'div', {'class': 'movie-info'}):
            title_spans = dom_parser.parse_dom(block, 'span', {'class': 'movie-title'})
            year_spans = dom_parser.parse_dom(block, 'span', {'class': 'movie-year'})
            if not title_spans:
                continue

            found_title = self.__strip_link(title_spans[0])
            found_year = self.__strip_link(year_spans[0]) if year_spans else ''

            link = re.search('href="([^"]+)', block)
            if not link:
                continue  # nothing to point at, so the entry is useless

            if not year or not found_year or year == found_year:
                results.append({'title': found_title, 'year': found_year, 'url': link.group(1).replace(self.base_url, '')})

        return results

Ejemplo n.º 5

0

Mostrar archivo

Archivo: yshows_scraper.py Proyecto: henry73/salts

 def get_sources(self, video):
     """Return one indirect hoster per anchor in the page's first <tbody>.

     The anchor's href is the hoster URL; the anchor content returned by
     parse_dom, with any <span> tags removed, is used as the host name and
     fed to scraper_utils.get_quality with QUALITIES.HIGH as the base.
     Returns an empty list when the video has no usable URL.
     """
     hosters = []
     source_url = self.get_url(video)
     if not source_url or source_url == FORCE_NO_MATCH:
         return hosters

     page_url = urlparse.urljoin(self.base_url, source_url)
     html = self._http_get(page_url, cache_limit=.25)
     tables = dom_parser.parse_dom(html, 'tbody')
     if tables:
         body = tables[0]
         hrefs = dom_parser.parse_dom(body, 'a', ret='href')
         labels = dom_parser.parse_dom(body, 'a')
         for href, label in zip(hrefs, labels):
             host = re.sub('</?span[^>]*>', '', label)
             quality = scraper_utils.get_quality(video, host, QUALITIES.HIGH)
             hosters.append({'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': None, 'rating': None, 'url': href, 'direct': False})
     return hosters

Ejemplo n.º 6

0

Mostrar archivo

Archivo: funtastic_scraper.py Proyecto: azumimuo/family-xbmc-addon

    def get_sources(self, video):
        """Gather embedded and linked hosters for *video*, one per URL.

        A quality string is read from the 'calishow' element or, failing
        that, from the first '#embed' tab link. Every iframe inside the
        'tab-content' div becomes a hoster; when that div exists, links
        from the 'olmt' and 'dlnmt' divs are added via __get_links. The
        result is keyed by URL so each URL appears once (the last entry
        wins).
        """
        hosters = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH:
            return hosters

        page_url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=.25)

        quality_hit = re.search('class="calishow">([^<]+)', html)
        if not quality_hit:
            quality_hit = re.search('<a[^>]*href="#embed\d*"[^>]+>([^<]+)', html)
        q_str = quality_hit.group(1) if quality_hit else ''

        tabs = dom_parser.parse_dom(html, 'div', {'class': 'tab-content'})
        if tabs:
            for iframe_src in dom_parser.parse_dom(tabs[0], 'iframe', ret='src'):
                host = urlparse.urlparse(iframe_src).hostname
                quality = scraper_utils.blog_get_quality(video, q_str, host)
                hosters.append({'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': None, 'rating': None, 'url': iframe_src, 'direct': False})

            for section_id in ('olmt', 'dlnmt'):
                section = dom_parser.parse_dom(html, 'div', {'id': section_id})
                if section:
                    hosters += self.__get_links(video, section[0])

        unique_by_url = dict((stream['url'], stream) for stream in hosters)
        return unique_by_url.values()

Ejemplo n.º 7

0

Mostrar archivo

Archivo: moviewatcher_scraper.py Proyecto: freeworldxbmc/KAOSbox-Repo

    def __movie_search(self, title, year):
        """Search the site for movies whose title contains *title*.

        Each 'video_item' div supplies a link (first <a href>) and a title
        (first <img alt>). Returns a list of {'url', 'title', 'year'}
        dicts for items whose normalized title contains the normalized
        query.

        *year* is accepted for interface compatibility but cannot filter
        anything: no year is extracted from the result markup, so every
        hit is reported with an empty year. (The original code tested an
        always-empty ``match_year``; that unreachable branch is removed.)
        """
        results = []
        search_url = urlparse.urljoin(self.base_url, '/search?q=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=1)
        norm_title = scraper_utils.normalize_title(title)
        match_year = ''  # no year information is parsed from the results
        for item in dom_parser.parse_dom(html, 'div', {'class': 'video_item'}):
            match_url = dom_parser.parse_dom(item, 'a', ret='href')
            match_title = dom_parser.parse_dom(item, 'img', ret='alt')
            if not (match_url and match_title):
                continue

            match_url = match_url[0]
            match_title = match_title[0]
            if norm_title in scraper_utils.normalize_title(match_title):
                result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)

        return results

Ejemplo n.º 8

0

Mostrar archivo

Archivo: tunemovie_scraper.py Proyecto: kevintone/tdbaddon

 def __get_gk_links(self, html, page_url):
     """Resolve every server line in *html* to its stream URLs.

     For each 'server_line' div that carries data-film, data-name and
     data-server attributes, LINK_URL is requested with those ids and the
     JSON reply is inspected: a string under 's' is a single stream
     (QUALITIES.HIGH); otherwise 's' is iterated as a list of dicts with
     a 'file' URL and an optional 'label'.

     Args:
         html: markup of the page that lists the servers.
         page_url: URL of that page, sent as the Referer header.

     Returns:
         dict mapping stream URL to quality.
     """
     sources = {}
     # Copy before adding the Referer: XHR is defined outside this method
     # and shared, so writing into it directly would leak this page's
     # Referer into every other request that uses XHR.
     headers = dict(XHR)
     headers['Referer'] = page_url
     url = urlparse.urljoin(self.base_url, LINK_URL)
     for server_line in dom_parser.parse_dom(html, 'div', {'class': '[^"]*server_line[^"]*'}):
         film_id = dom_parser.parse_dom(server_line, 'a', ret='data-film')
         name_id = dom_parser.parse_dom(server_line, 'a', ret='data-name')
         server_id = dom_parser.parse_dom(server_line, 'a', ret='data-server')
         if not (film_id and name_id and server_id):
             continue

         data = {'ipplugins': 1, 'ip_film': film_id[0], 'ip_server': server_id[0], 'ip_name': name_id[0]}
         # Kept in its own name so the *html* argument is not clobbered.
         response = self._http_get(url, data=data, headers=headers, cache_limit=.25)
         js_data = scraper_utils.parse_json(response, url)
         if 's' not in js_data:
             continue

         if isinstance(js_data['s'], basestring):
             sources[js_data['s']] = QUALITIES.HIGH
             continue

         for stream in js_data['s']:
             stream_url = stream['file']
             if self._get_direct_hostname(stream_url) == 'gvideo':
                 quality = scraper_utils.gv_get_quality(stream_url)
             elif 'label' in stream:
                 quality = scraper_utils.height_get_quality(stream['label'])
             else:
                 quality = QUALITIES.HIGH
             sources[stream_url] = quality
     return sources

Ejemplo n.º 9

0

Mostrar archivo

Archivo: tunemovie_scraper.py Proyecto: kevintone/tdbaddon

 def search(self, video_type, title, year, season=''):
     """Search the site for *title* and return the matching entries.

     A thumb whose title ends in "Season <n>" is a season entry; anything
     else is treated as a movie. Only entries of the requested kind are
     returned (VIDEO_TYPES.MOVIE or VIDEO_TYPES.SEASON).

     Args:
         video_type: VIDEO_TYPES.MOVIE or VIDEO_TYPES.SEASON.
         title: text to search for.
         year: wanted year; for movies, an entry is dropped only when
             both this and the entry's year are known and differ.
         season: when set, only that season number is returned for
             season searches.

     Returns:
         list of {'url', 'title', 'year'} dicts.
     """
     search_url = urlparse.urljoin(self.base_url, '/search-movies/%s.html')
     search_url = search_url % (urllib.quote_plus(title))
     html = self._http_get(search_url, cache_limit=0)
     results = []
     wants_movie = video_type == VIDEO_TYPES.MOVIE
     # The original tested the constant VIDEO_TYPES.SEASON for truthiness
     # instead of comparing it, so season entries leaked into movie results.
     wants_season = video_type == VIDEO_TYPES.SEASON
     for thumb in dom_parser.parse_dom(html, 'div', {'class': 'thumb'}):
         match_title = dom_parser.parse_dom(thumb, 'a', {'class': 'clip-link'}, ret='title')
         url = dom_parser.parse_dom(thumb, 'a', {'class': 'clip-link'}, ret='href')
         if not (match_title and url):
             continue

         match_title, url = match_title[0], url[0]
         is_season = re.search(r'Season\s+(\d+)$', match_title, re.I)
         if not ((wants_movie and not is_season) or (wants_season and is_season)):
             continue

         match_year = ''
         if wants_movie:
             year_frag = dom_parser.parse_dom(thumb, 'div', {'class': '[^"]*status-year[^"]*'})
             if year_frag:
                 # Stays '' (not an empty list) when the element is missing.
                 match_year = year_frag[0]
         elif season and int(is_season.group(1)) != int(season):
             continue

         if not year or not match_year or year == match_year:
             result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
             results.append(result)
     return results

Ejemplo n.º 10

0

Mostrar archivo

    def search(self, video_type, title, year):
        """Search '/movie/search/<title>' and return the movie entries.

        Each 'ml-item' div that has an 'mli-info' span and an href, and no
        'mli-eps' span, yields a {'title', 'year', 'url'} dict. The title
        has <h2> tags and a trailing four-digit year removed; the URL is
        the item link joined with 'watching.html'. An entry is dropped
        only when both *year* and the entry's year are known and differ.
        """
        search_url = urlparse.urljoin(self.base_url, '/movie/search/') + title
        html = self._http_get(search_url, cache_limit=1)
        results = []
        for item in dom_parser.parse_dom(html, 'div', {'class': 'ml-item'}):
            info_spans = dom_parser.parse_dom(item, 'span', {'class': 'mli-info'})
            href = re.search('href="([^"]+)', item, re.DOTALL)
            year_hit = re.search('class="jt-info">(\d{4})<', item)
            episode_spans = dom_parser.parse_dom(item, 'span', {'class': 'mli-eps'})
            if not info_spans or not href or episode_spans:
                continue

            found_title = re.sub('</?h2>', '', info_spans[0])
            found_title = re.sub('\s+\d{4}$', '', found_title)
            watch_url = urlparse.urljoin(href.group(1), 'watching.html')
            found_year = year_hit.group(1) if year_hit else ''

            if not year or not found_year or year == found_year:
                results.append({'title': found_title, 'year': found_year, 'url': self._pathify_url(watch_url)})

        return results

Ejemplo n.º 11

0

Mostrar archivo

    def get_sources(self, video):
        """Return a hoster for every iframe in the page's 'list-wrap' div.

        The page is decoded as UTF-8 (undecodable bytes ignored) before
        parsing. Each iframe src becomes a hoster marked direct, with
        QUALITIES.HIGH and the URL's hostname as host. Returns an empty
        list when the video has no usable URL.
        """
        hosters = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH:
            return hosters

        page_url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=.5).decode('utf-8', 'ignore')
        wrappers = dom_parser.parse_dom(html, 'div', {'class': 'list-wrap'})
        if wrappers:
            for stream_url in dom_parser.parse_dom(wrappers[0], 'iframe', ret='src'):
                host = urlparse.urlparse(stream_url).hostname
                hosters.append({'multi-part': False, 'host': host, 'url': stream_url, 'class': self, 'rating': None, 'views': None, 'quality': QUALITIES.HIGH, 'direct': True})

        return hosters

Ejemplo n.º 12

0

Mostrar archivo

    def search(self, video_type, title, year):
        """Search the series archive for *title*.

        Each 'tv-series-single' div supplies a link (first href), an
        optional four-digit year inside a <span>, and a title taken from
        the trailing text of the 'title' anchor.

        Returns a list of {'url', 'title', 'year'} dicts with the base URL
        removed from each link. An entry is dropped only when both *year*
        and the entry's year are known and differ.
        """
        search_url = urlparse.urljoin(self.base_url, '/arsiv?limit=&tur=&orderby=&ulke=&order=&yil=&dizi_adi=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=8)
        results = []
        for item in dom_parser.parse_dom(html, 'div', {'class': 'tv-series-single'}):
            # Explicit match checks replace the former bare ``except:``
            # blocks, which also hid unrelated errors.
            url_match = re.search('href="([^"]+)', item)
            url = url_match.group(1) if url_match else ''

            year_match = re.search(r'<span>\s*(\d{4})\s*</span>', item)
            match_year = year_match.group(1) if year_match else ''

            match_title = ''
            title_frag = dom_parser.parse_dom(item, 'a', {'class': 'title'})
            if title_frag:
                # Keep only the text after the last tag. The original
                # discarded this result and returned the raw parse_dom
                # list as the title.
                title_match = re.search('([^>]+)$', title_frag[0])
                if title_match:
                    match_title = title_match.group(1)

            if url and match_title and (not year or not match_year or year == match_year):
                # NOTE(review): the parsed year only filters; results are
                # reported with an empty year, as before -- confirm intent.
                result = {'url': url.replace(self.base_url, ''), 'title': match_title, 'year': ''}
                results.append(result)

        return results

Ejemplo n.º 13

0

Mostrar archivo

    def search(self, video_type, title, year, season=''):
        """Search the site for *title* and return the matching entries.

        List items inside the first 'cfv' list that contain a 'status' div
        are season entries; the others are movies. Only entries of the
        requested kind are returned (VIDEO_TYPES.MOVIE or
        VIDEO_TYPES.SEASON).

        Args:
            video_type: VIDEO_TYPES.MOVIE or VIDEO_TYPES.SEASON.
            title: text to search for.
            year: wanted year; for movies the year is read from the
                '-YYYY.html' URL suffix and an entry is dropped only when
                both are known and differ.
            season: when set, season searches keep only titles ending in
                "Season <season>".

        Returns:
            list of {'title', 'year', 'url'} dicts.
        """
        results = []
        search_url = urlparse.urljoin(self.__get_base_url(video_type), '/search/%s.html' % (urllib.quote_plus(title)))
        html = self._http_get(search_url, cache_limit=1)
        fragment = dom_parser.parse_dom(html, 'ul', {'class': 'cfv'})
        if not fragment:
            return results

        wants_movie = video_type == VIDEO_TYPES.MOVIE
        # The original tested the constant VIDEO_TYPES.SEASON for
        # truthiness instead of comparing it, so season entries were
        # returned for every video type.
        wants_season = video_type == VIDEO_TYPES.SEASON
        for item in dom_parser.parse_dom(fragment[0], 'li'):
            is_season = dom_parser.parse_dom(item, 'div', {'class': 'status'})
            if not ((wants_movie and not is_season) or (wants_season and is_season)):
                continue

            match_url = dom_parser.parse_dom(item, 'a', ret='href')
            match_title = dom_parser.parse_dom(item, 'a', ret='title')
            if not (match_url and match_title):
                continue

            match_title = match_title[0]
            match_url = match_url[0]
            match_year = ''
            if wants_season:
                if season and not re.search(r'Season\s+%s$' % (season), match_title, re.I):
                    continue
            else:
                match = re.search(r'-(\d{4})\.html', match_url)
                if match:
                    match_year = match.group(1)

            if not year or not match_year or year == match_year:
                result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                results.append(result)

        return results

Ejemplo n.º 14

0

Mostrar archivo

 def __get_gk_links(self, html, page_url, video_type, episode):
     """Resolve every server line for a page to its stream URLs.

     For episodes, when the page has a 'phimid' input, the server list
     is first re-fetched from '/ajax.php' on the TV base URL and replaces
     *html*. Then, for each 'server_line' div carrying data-film,
     data-name and data-server attributes, LINK_URL is requested and the
     JSON reply inspected: a string under 's' is a single stream
     (QUALITIES.HIGH); otherwise 's' is iterated as a list of dicts with
     a 'file' URL and an optional 'label'.

     Args:
         html: markup of the page that lists the servers.
         page_url: URL of that page, sent as the Referer header.
         video_type: selects the base URL and the episode pre-fetch.
         episode: value sent as 'keyurl' in the episode pre-fetch.

     Returns:
         dict mapping stream URL to quality.
     """
     sources = {}
     # Copy before adding the Referer: XHR is defined outside this method
     # and shared, so writing into it directly would leak this page's
     # Referer into every other request that uses XHR.
     headers = dict(XHR)
     headers['Referer'] = page_url

     phimid = dom_parser.parse_dom(html, 'input', {'name': 'phimid'}, ret='value')
     if phimid and video_type == VIDEO_TYPES.EPISODE:
         ajax_url = urlparse.urljoin(self.tv_base_url, '/ajax.php')
         data = {'ipos_server': 1, 'phimid': phimid[0], 'keyurl': episode}
         # Deliberate rebinding: the ajax reply is the markup to parse.
         html = self._http_get(ajax_url, data=data, headers=headers, cache_limit=.5)

     url = urlparse.urljoin(self.__get_base_url(video_type), LINK_URL)
     for server_line in dom_parser.parse_dom(html, 'div', {'class': '[^"]*server_line[^"]*'}):
         film_id = dom_parser.parse_dom(server_line, 'a', ret='data-film')
         name_id = dom_parser.parse_dom(server_line, 'a', ret='data-name')
         server_id = dom_parser.parse_dom(server_line, 'a', ret='data-server')
         if not (film_id and name_id and server_id):
             continue

         data = {'ipplugins': 1, 'ip_film': film_id[0], 'ip_server': server_id[0], 'ip_name': name_id[0]}
         response = self._http_get(url, data=data, headers=headers, cache_limit=.25)
         js_data = scraper_utils.parse_json(response, url)
         if 's' not in js_data:
             continue

         if isinstance(js_data['s'], basestring):
             sources[js_data['s']] = QUALITIES.HIGH
             continue

         for stream in js_data['s']:
             stream_url = stream['file']
             if self._get_direct_hostname(stream_url) == 'gvideo':
                 quality = scraper_utils.gv_get_quality(stream_url)
             elif 'label' in stream:
                 quality = scraper_utils.height_get_quality(stream['label'])
             else:
                 quality = QUALITIES.HIGH
             sources[stream_url] = quality
     return sources

Ejemplo n.º 15

0

Mostrar archivo

Archivo: torba_scraper.py Proyecto: Stevie-Bs/repository.xvbmc

    def search(self, video_type, title, year, season=''):
        """Search via SEARCH_URL and return the matching film entries.

        Each 'films-item' list item with a link and a 'films-item-title'
        div yields a {'title', 'year', 'url'} dict. <span> tags are
        removed from the title and the year is the first run of digits in
        the 'films-item-year' div, or '' when there is none. An entry is
        dropped only when both *year* and its own year are known and
        differ.
        """
        results = []
        search_url = urlparse.urljoin(self.base_url, SEARCH_URL) % (urllib.quote_plus(title))
        html = self._http_get(search_url, headers=XHR, cache_limit=1)
        for film in dom_parser.parse_dom(html, 'li', {'class': 'films-item'}):
            hrefs = dom_parser.parse_dom(film, 'a', ret='href')
            title_divs = dom_parser.parse_dom(film, 'div', {'class': 'films-item-title'})
            year_divs = dom_parser.parse_dom(film, 'div', {'class': 'films-item-year'})
            if not (hrefs and title_divs):
                continue

            film_title = re.sub('</?span>', '', title_divs[0])
            digits = re.search('(\d+)', year_divs[0]) if year_divs else None
            film_year = digits.group(1) if digits else ''

            if not year or not film_year or year == film_year:
                results.append({'title': scraper_utils.cleanse_title(film_title), 'year': film_year, 'url': hrefs[0]})

        return results

Ejemplo n.º 16

0

Mostrar archivo

 def get_sources(self, video):
     """Return direct hosters for every <source> tag of the video player.

     The first 'screen' div is inspected: when it references a script by
     src, that script is downloaded and scanned; otherwise the div's own
     markup is scanned. 'gvideo' streams get their quality from
     gv_get_quality; all others use the height parsed from the URL and
     have a '|User-Agent=...' suffix appended to the URL.
     """
     hosters = []
     source_url = self.get_url(video)
     if not source_url or source_url == FORCE_NO_MATCH:
         return hosters

     page_url = urlparse.urljoin(self.base_url, source_url)
     page_html = self._http_get(page_url, cache_limit=.5)
     screens = dom_parser.parse_dom(page_html, 'div', {'class': '[^"]*screen[^"]*'})
     if not screens:
         return hosters

     script_srcs = dom_parser.parse_dom(screens[0], 'script', ret='src')
     if script_srcs:
         markup = self._http_get(urlparse.urljoin(self.base_url, script_srcs[0]), cache_limit=.5)
     else:
         markup = screens[0]

     for tag in re.finditer('<source[^>]+src="([^"]+)', markup):
         stream_url = tag.group(1)
         host = self._get_direct_hostname(stream_url)
         if host == 'gvideo':
             quality = scraper_utils.gv_get_quality(stream_url)
         else:
             _, _, height, _ = scraper_utils.parse_movie_link(stream_url)
             quality = scraper_utils.height_get_quality(height)
             stream_url += '|User-Agent=%s' % (scraper_utils.get_ua())

         hosters.append({'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': None, 'rating': None, 'url': stream_url, 'direct': True})
     return hosters

Ejemplo n.º 17

0

Mostrar archivo

    def search(self, video_type, title, year):
        """Look *title* up through the site's '/?query=' search.

        Returns one {'title', 'year', 'url'} dict per 'movie-info' div
        that has both a 'movie-title' span and an href; the base URL is
        removed from the link. A hit is rejected only when *year* and the
        hit's own year are both known and differ.
        """
        search_url = urlparse.urljoin(self.base_url, '/?query=')
        search_url = search_url + urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=.25)

        hits = []
        for info in dom_parser.parse_dom(html, 'div', {'class': 'movie-info'}):
            titles = dom_parser.parse_dom(info, 'span', {'class': 'movie-title'})
            years = dom_parser.parse_dom(info, 'span', {'class': 'movie-year'})
            if not titles:
                continue

            hit_title = self.__strip_link(titles[0])
            hit_year = ''
            if years:
                hit_year = self.__strip_link(years[0])

            href = re.search('href="([^"]+)', info)
            if href is None:
                continue

            year_conflict = year and hit_year and year != hit_year
            if not year_conflict:
                hits.append({
                    'title': hit_title,
                    'year': hit_year,
                    'url': href.group(1).replace(self.base_url, ''),
                })

        return hits

Ejemplo n.º 18

0

Mostrar archivo

    def search(self, video_type, title, year, season=''):
        """Find movies via site search, or other types via the home page.

        For VIDEO_TYPES.MOVIE the '/?s=' search is queried and the href
        and title of every 'clip-link' anchor are paired up. For any other
        type the base page is fetched and its 'cat-item' links are used.
        Each "Title (YYYY)" label is split into title and year, and an
        entry is kept when its normalized title contains the normalized
        query and its year does not conflict with *year*.
        """
        results = []
        if video_type != VIDEO_TYPES.MOVIE:
            html = self._http_get(self.base_url, cache_limit=8)
            candidates = re.findall('<li\s+class="cat-item[^>]+>\s*<a\s+href="([^"]+)[^>]+>([^<]+)', html)
        else:
            search_url = urlparse.urljoin(self.base_url, '/?s=') + urllib.quote_plus('%s' % (title))
            html = self._http_get(search_url, cache_limit=1)
            hrefs = dom_parser.parse_dom(html, 'a', {'class': 'clip-link'}, 'href')
            labels = dom_parser.parse_dom(html, 'a', {'class': 'clip-link'}, 'title')
            candidates = zip(hrefs, labels)

        wanted = scraper_utils.normalize_title(title)
        for url, label in candidates:
            split = re.search('(.*?)\s+\(?(\d{4})\)?', label)
            if split:
                found_title, found_year = split.groups()
            else:
                found_title, found_year = label, ''

            if wanted not in scraper_utils.normalize_title(found_title):
                continue
            if not year or not found_year or year == found_year:
                results.append({'title': scraper_utils.cleanse_title(found_title), 'year': found_year, 'url': scraper_utils.pathify_url(url)})

        return results

Ejemplo n.º 19

0

Mostrar archivo

Archivo: filmstreaming_scraper.py Proyecto: beljim/tknorris-beta-repo

 def get_sources(self, video):
     """Return indirect hosters for the iframes of the first player div.

     The quality comes from the first 'calidad' span: a value of
     'COMING SOON' (any case) means there is nothing to play and an
     empty list is returned; otherwise the text is converted with
     _height_get_quality, falling back to QUALITIES.HIGH when the span
     is missing or the conversion fails. Every iframe src in the first
     'player<N>' div then becomes a hoster with that quality.
     """
     source_url = self.get_url(video)
     hosters = []
     if not source_url or source_url == FORCE_NO_MATCH:
         return hosters

     url = urlparse.urljoin(self.base_url, source_url)
     html = self._http_get(url, cache_limit=.5)

     quality = QUALITIES.HIGH
     q_str = dom_parser.parse_dom(html, 'span', {'class': r'calidad\d*'})
     if q_str:
         if q_str[0].upper() == 'COMING SOON':
             return hosters

         try:
             quality = self._height_get_quality(q_str[0])
         except Exception:
             # Best-effort conversion. Narrowed from a bare ``except:`` so
             # KeyboardInterrupt/SystemExit are no longer swallowed.
             quality = QUALITIES.HIGH

     fragment = dom_parser.parse_dom(html, 'div', {'id': r'player\d+'})
     if fragment:
         for match in re.finditer('<iframe[^>]+src="([^"]+)', fragment[0], re.I):
             stream_url = match.group(1)
             host = urlparse.urlparse(stream_url).hostname
             hoster = {'multi-part': False, 'url': stream_url, 'host': host, 'class': self, 'quality': quality, 'views': None, 'rating': None, 'direct': False}
             hosters.append(hoster)
     return hosters

Ejemplo n.º 20

0

Mostrar archivo

    def get_sources(self, video):
        """Return one indirect source per linked 'elemento' list item.

        The item's first href is the stream URL. The upper-cased text of
        its 'd' span is looked up in QUALITY_MAP (default QUALITIES.HIGH)
        to get a base quality, which scraper_utils.get_quality then
        adjusts for the URL's hostname. Returns an empty list when the
        video has no usable URL.
        """
        sources = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH:
            return sources

        page_url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=.5)
        for element in dom_parser.parse_dom(html, 'li', {'class': 'elemento'}):
            href = re.search('href="([^"]+)', element)
            if not href:
                continue

            stream_url = href.group(1)
            labels = dom_parser.parse_dom(element, 'span', {'class': 'd'})
            label = labels[0].upper() if labels else ''
            base_quality = QUALITY_MAP.get(label, QUALITIES.HIGH)
            host = urlparse.urlparse(stream_url).hostname
            quality = scraper_utils.get_quality(video, host, base_quality)
            sources.append({'multi-part': False, 'url': stream_url, 'host': host, 'class': self, 'quality': quality, 'views': None, 'rating': None, 'direct': False})

        return sources

Ejemplo n.º 21

0

Mostrar archivo

Archivo: watchepisodes_scraper.py Proyecto: monicarero/repository.xvbmc

    def get_sources(self, video):
        """Return one indirect hoster per 'ldr-item' link on the page.

        For each item the stream URL is the anchor's 'data-actuallink'
        attribute; items without one are skipped. ``views`` is the number
        found in the 'click-count' div and ``rating`` is ten times the
        integer in the 'point' div; each is None when it is missing,
        zero (rating) or not parseable.
        """
        source_url = self.get_url(video)
        hosters = []
        if not source_url or source_url == FORCE_NO_MATCH:
            return hosters

        page_url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=.25)
        for link in dom_parser.parse_dom(html, 'div', {'class': '[^"]*ldr-item[^"]*'}):
            stream_url = dom_parser.parse_dom(link, 'a', ret='data-actuallink')

            views = None
            watched = dom_parser.parse_dom(link, 'div', {'class': 'click-count'})
            if watched:
                match = re.search(r' (\d+) ', watched[0])
                if match:
                    views = match.group(1)

            # Reset per item: previously ``rating`` was unbound for the
            # first item without a score (NameError) and carried over a
            # stale value from the preceding item afterwards.
            rating = None
            score = dom_parser.parse_dom(link, 'div', {'class': r'\s*point\s*'})
            if score:
                try:
                    points = int(score[0])
                except ValueError:
                    points = 0  # non-numeric score text: treat as unrated
                if points:
                    rating = points * 10

            if stream_url:
                stream_url = stream_url[0].strip()
                host = urlparse.urlparse(stream_url).hostname
                quality = scraper_utils.get_quality(video, host, QUALITIES.HIGH)
                hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': views, 'rating': rating, 'url': stream_url, 'direct': False}
                hosters.append(hoster)

        return hosters

Ejemplo n.º 22

0

Mostrar archivo

Archivo: diziay_scraper.py Proyecto: beljim/tknorris-beta-repo

    def get_sources(self, video):
        """Return the gvideo streams found inside the player iframe of *video*.

        Fetches the page for ``self.get_url(video)``, follows the first iframe
        inside the ``player`` div and scans its ``sources: [...]`` list for
        ``"file"`` URLs.  Only URLs whose direct hostname is ``'gvideo'``
        produce a hoster dict; each dict also carries a ``subs`` flag.

        Returns:
            A list of hoster dicts (possibly empty).
        """
        hosters = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH:
            return hosters

        page_url = urlparse.urljoin(self.base_url, source_url)
        page_html = self._http_get(page_url, cache_limit=1)
        player = dom_parser.parse_dom(page_html, 'div', {'class': 'player'})
        if not player:
            return hosters

        iframe_srcs = dom_parser.parse_dom(player[0], 'iframe', ret='src')
        if not iframe_srcs:
            return hosters

        iframe_html = self._http_get(iframe_srcs[0], cache_limit=.5)

        # if captions exist, then they aren't hardcoded
        subs = not re.search('kind\s*:\s*"captions"', iframe_html)

        source_list = re.search('sources\s*:\s*\[(.*?)\]', iframe_html)
        if not source_list:
            return hosters

        for file_match in re.finditer('"file"\s*:\s*"([^"]+)', source_list.group(1)):
            stream_url = file_match.group(1)
            if self._get_direct_hostname(stream_url) != 'gvideo':
                continue

            quality = self._gv_get_quality(stream_url)
            hosters.append({
                'multi-part': False,
                'host': self._get_direct_hostname(stream_url),
                'class': self,
                'quality': quality,
                'views': None,
                'rating': None,
                'url': stream_url,
                'direct': True,
                'subs': subs,
            })

        return hosters

Ejemplo n.º 23

0

Mostrar archivo

Archivo: dizilab_scraper.py Proyecto: sfennell/salts

    def search(self, video_type, title, year):
        """Search the site archive for *title* and return matching series.

        Each ``tv-series-single`` div contributes a result when it has both a
        link and a title and its year (if one is found in a ``<span>``) is
        compatible with *year*.

        Args:
            video_type: Not used by this implementation.
            title: Title to search for; it is URL-quoted into the query.
            year: Year filter; a falsy value disables the filter.

        Returns:
            A list of dicts with the keys ``url``, ``title`` and ``year``.
        """
        search_url = urlparse.urljoin(self.base_url, '/arsiv?limit=&tur=&orderby=&ulke=&order=&yil=&dizi_adi=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=8)
        results = []
        for item in dom_parser.parse_dom(html, 'div', {'class': 'tv-series-single'}):
            # Explicit "no match" handling instead of bare ``except:`` blocks,
            # which also swallowed unrelated errors.
            url_match = re.search(r'href="([^"]+)', item)
            url = url_match.group(1) if url_match else ''

            year_match = re.search(r'<span>\s*(\d{4})\s*</span>', item)
            match_year = year_match.group(1) if year_match else ''

            match_title = ''
            title_frag = dom_parser.parse_dom(item, 'a', {'class': 'title'})
            if title_frag:
                # Keep only the text after the last tag inside the anchor.
                title_match = re.search(r'([^>]+)$', title_frag[0])
                if title_match:
                    match_title = title_match.group(1).strip()

            if url and match_title and (not year or not match_year or year == match_year):
                # NOTE(review): the year is deliberately left as reported by
                # the original code (always ''), even though match_year is
                # known here - confirm whether callers expect it blank.
                result = {'url': self._pathify_url(url), 'title': match_title, 'year': ''}
                results.append(result)

        return results

Ejemplo n.º 24

0

Mostrar archivo

    def search(self, video_type, title, year):
        """Search the site for *title* and return matching entries.

        The base URL depends on *video_type*.  Results whose link looks like
        an individual episode (``season-N-episodeN``) are skipped.  Title and
        year come from the image ``alt`` text (``Title (YYYY)``); when the alt
        text has no year, the ``year`` div is used instead.

        Args:
            video_type: Passed to the base-URL lookup.
            title: Title to search for; it is URL-quoted into the query.
            year: Year filter; a falsy value disables the filter.

        Returns:
            A list of dicts with the keys ``url``, ``title`` and ``year``.
        """
        results = []
        search_url = urlparse.urljoin(self.__get_base_url(video_type), '/?s=%s' % (urllib.quote_plus(title)))
        html = self._http_get(search_url, cache_limit=1)
        for movie in dom_parser.parse_dom(html, 'div', {'class': 'movie'}):
            match = re.search(r'href="([^"]+)', movie)
            if not match:
                continue

            match_url = match.group(1)
            if re.search(r'season-\d+-episode\d+', match_url):
                continue

            match_title_year = dom_parser.parse_dom(movie, 'img', ret='alt')
            if not match_title_year:
                continue

            match_title_year = match_title_year[0]
            match = re.search(r'(.*?)\s+\((\d{4})\)', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                year_frag = dom_parser.parse_dom(movie, 'div', {'class': 'year'})
                # Explicit emptiness test instead of a bare ``except:``.
                match_year = year_frag[0] if year_frag else ''

            if not year or not match_year or year == match_year:
                result = {'url': self._pathify_url(match_url), 'title': match_title, 'year': match_year}
                results.append(result)

        return results

Ejemplo n.º 25

0

Mostrar archivo

Archivo: dizibox_scraper.py Proyecto: c0ns0le/YCBuilds

    def get_sources(self, video):
        """Return the direct streams behind the matching ``<option>`` of *video*.

        Fetches the page for ``self.get_url(video)``, picks the ``<option>``
        whose label matches the ``Altyaz...s...z`` pattern, loads the page it
        points to, then follows the iframe in the ``object-wrapper`` span and
        collects every distinct ``file``/``label`` pair found there.

        Returns:
            A list of hoster dicts (possibly empty); each URL has a
            ``|User-Agent=...`` suffix appended.
        """
        hosters = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH:
            return hosters

        page_url = urlparse.urljoin(self.base_url, source_url)
        page_html = self._http_get(page_url, cache_limit=.25)
        option = re.search('''<option[^>]+value\s*=\s*["']([^"']+)[^>]*>(?:Altyaz.{1,3}s.{1,3}z)<''', page_html)
        if not option:
            return hosters

        option_url = urlparse.urljoin(self.base_url, option.group(1))
        option_html = self._http_get(option_url, cache_limit=.25)
        wrapper = dom_parser.parse_dom(option_html, 'span', {'class': 'object-wrapper'})
        if not wrapper:
            return hosters

        iframe_srcs = dom_parser.parse_dom(wrapper[0], 'iframe', ret='src')
        if not iframe_srcs:
            return hosters

        iframe_html = self._http_get(iframe_srcs[0], cache_limit=.25)

        already_seen = set()
        for entry in re.finditer('"?file"?\s*:\s*"([^"]+)"\s*,\s*"?label"?\s*:\s*"(\d+)p?[^"]*"', iframe_html):
            stream_url, height = entry.groups()
            if stream_url in already_seen:
                continue
            # De-duplicate on the bare URL, before the User-Agent suffix.
            already_seen.add(stream_url)

            stream_url += '|User-Agent=%s' % (scraper_utils.get_ua())
            host = self._get_direct_hostname(stream_url)
            if host == 'gvideo':
                quality = scraper_utils.gv_get_quality(stream_url)
            else:
                quality = scraper_utils.height_get_quality(height)

            hosters.append({
                'multi-part': False,
                'host': self._get_direct_hostname(stream_url),
                'class': self,
                'quality': quality,
                'views': None,
                'rating': None,
                'url': stream_url,
                'direct': True,
            })

        return hosters

Ejemplo n.º 26

0

Mostrar archivo

Archivo: torbase_scraper.py Proyecto: SQL-MisterMagoo/salts

    def search(self, video_type, title, year):
        """Query ``SEARCH_URL`` for *title* and return the matching films.

        Every ``films-item`` list entry with both a link and a title becomes
        a candidate; its year is the first run of digits in the
        ``films-item-year`` div, or ``""`` when absent.

        Args:
            video_type: Not used by this implementation.
            title: Title to search for; it is URL-quoted into the query.
            year: Year filter; a falsy value disables the filter.

        Returns:
            A list of dicts with the keys ``title``, ``year`` and ``url``.
        """
        query_url = urlparse.urljoin(self.base_url, SEARCH_URL) % (urllib.quote_plus(title))
        html = self._http_get(query_url, headers=XHR, cache_limit=1)

        results = []
        for film in dom_parser.parse_dom(html, "li", {"class": "films-item"}):
            hrefs = dom_parser.parse_dom(film, "a", ret="href")
            titles = dom_parser.parse_dom(film, "div", {"class": "films-item-title"})
            years = dom_parser.parse_dom(film, "div", {"class": "films-item-year"})
            if not (hrefs and titles):
                continue

            film_title = re.sub("</?span>", "", titles[0])

            digits = re.search("(\d+)", years[0]) if years else None
            film_year = digits.group(1) if digits else ""

            if year and film_year and year != film_year:
                continue

            results.append({"title": film_title, "year": film_year, "url": hrefs[0]})

        return results

Ejemplo n.º 27

0

Mostrar archivo

Archivo: miradetodo_scraper.py Proyecto: kevintone/tdbaddon

    def search(self, video_type, title, year, season=''):
        """Search the site for *title* and return the non-episode matches.

        Items whose title contains an ``NxM`` pattern are treated as episodes
        and skipped.  The year is taken from a trailing ``(YYYY)`` in the
        title, and is overridden by the ``year`` span when that span exists.

        Args:
            video_type: Not used by this implementation.
            title: Title to search for; it is URL-quoted into the query.
            year: Year filter; a falsy value disables the filter.
            season: Not used by this implementation.

        Returns:
            A list of dicts with the keys ``title``, ``year`` and ``url``.
        """
        query_url = urlparse.urljoin(self.base_url, '/?s=') + urllib.quote_plus(title)
        html = self._http_get(query_url, cache_limit=1)

        results = []
        for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
            link = re.search('href="([^"]+)', item)
            title_frag = dom_parser.parse_dom(item, 'span', {'class': 'tt'})
            year_frag = dom_parser.parse_dom(item, 'span', {'class': 'year'})
            if not (link and title_frag):
                continue

            url = link.group(1)
            found_title = title_frag[0]
            if re.search('\d+\s*x\s*\d+', found_title):
                continue  # exclude episodes

            title_year = re.search('(.*?)\s+\((\d{4})\)', found_title)
            if title_year:
                found_title, found_year = title_year.groups()
            else:
                found_year = ''

            # The dedicated year span wins over a year parsed from the title.
            if year_frag:
                found_year = year_frag[0]

            if year and found_year and year != found_year:
                continue

            results.append({
                'title': scraper_utils.cleanse_title(found_title),
                'year': found_year,
                'url': scraper_utils.pathify_url(url)
            })

        return results

Ejemplo n.º 28

0

Mostrar archivo

 def search(self, video_type, title, year, season=''):
     """Query ``/search.php`` for *title* and return movie or season matches.

     An entry whose label ends in ``S<n>`` / ``Season <n>`` is a season
     entry.  Movie searches keep only non-season entries; season searches
     keep only season entries (optionally restricted to *season*).

     Args:
         video_type: ``VIDEO_TYPES.MOVIE`` or ``VIDEO_TYPES.SEASON``; any
             other value yields no results.
         title: Title to search for; it is URL-quoted into the query.
         year: Not used by this implementation.
         season: Season number to keep for season searches; falsy keeps all.

     Returns:
         A list of dicts with the keys ``title``, ``year`` and ``url``.
     """
     search_url = urlparse.urljoin(self.base_url, '/search.php?q=%s&limit=20&timestamp=%s' % (urllib.quote_plus(title), int(time.time())))
     html = self._http_get(search_url, cache_limit=.25)
     results = []
     items = dom_parser.parse_dom(html, 'li')
     if len(items) < 2:
         return results

     wants_movie = video_type == VIDEO_TYPES.MOVIE
     wants_season = video_type == VIDEO_TYPES.SEASON

     # The first <li> is always skipped (presumably a header row - TODO confirm).
     for item in items[1:]:
         match_url = dom_parser.parse_dom(item, 'a', ret='href')
         match_title_year = dom_parser.parse_dom(item, 'strong')
         if not (match_url and match_title_year):
             continue

         match_url = match_url[0]
         match_title_year = re.sub('</?strong>', '', match_title_year[0])
         is_season = re.search(r'S(?:eason\s+)?(\d+)$', match_title_year, re.I)

         # Fixed: the season clause used to test the truthiness of the
         # VIDEO_TYPES.SEASON constant rather than comparing it with
         # video_type, so season entries leaked into every search type.
         if not ((not is_season and wants_movie) or (is_season and wants_season)):
             continue

         if wants_movie:
             match = re.search(r'(.*?)(?:\s+\(?(\d{4})\)?)', match_title_year)
             if match:
                 match_title, match_year = match.groups()
             else:
                 match_title = match_title_year
                 match_year = ''
         else:
             log_utils.log(is_season.group(1))
             if season and int(is_season.group(1)) != int(season):
                 continue
             match_title = match_title_year
             match_year = ''

         result = {'title': match_title, 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
         results.append(result)
     return results

Ejemplo n.º 29

0

Mostrar archivo

Archivo: vivoto_scraper.py Proyecto: freeworldxbmc/KAOSbox-Repo

 def search(self, video_type, title, year, season=''):
     """Search ``/search/<title>.html`` and return movie or season matches.

     Entries whose title ends in ``Season <n>`` are season entries.  Movie
     searches keep non-season entries and report the ``year`` span; season
     searches keep season entries (optionally only *season*) with an empty
     year.

     Args:
         video_type: ``VIDEO_TYPES.MOVIE`` or ``VIDEO_TYPES.SEASON``.
         title: Title to search for; it is URL-quoted into the path.
         year: Not used by this implementation.
         season: Season number to keep for season searches; falsy keeps all.

     Returns:
         A list of dicts with the keys ``title``, ``year`` and ``url``.
     """
     url_template = urlparse.urljoin(self.base_url, '/search/%s.html')
     html = self._http_get(url_template % (urllib.quote_plus(title)), cache_limit=1)

     results = []
     listing = dom_parser.parse_dom(html, 'div', {'class': 'movie'})
     if not listing:
         return results

     for entry in dom_parser.parse_dom(listing[0], 'li'):
         hrefs = dom_parser.parse_dom(entry, 'a', ret='href')
         titles = dom_parser.parse_dom(entry, 'span', {'class': 'text'})
         years = dom_parser.parse_dom(entry, 'span', {'class': 'year'})
         if not (hrefs and titles):
             continue

         entry_title = re.sub('</?strong>', '', titles[0])
         is_season = re.search('Season\s+(\d+)$', entry_title, re.I)

         movie_hit = not is_season and video_type == VIDEO_TYPES.MOVIE
         season_hit = is_season and video_type == VIDEO_TYPES.SEASON
         if not (movie_hit or season_hit):
             continue

         if video_type == VIDEO_TYPES.MOVIE:
             entry_year = years[0] if years else ''
         else:
             if season and int(is_season.group(1)) != int(season):
                 continue
             entry_year = ''

         results.append({
             'title': scraper_utils.cleanse_title(entry_title),
             'year': entry_year,
             'url': scraper_utils.pathify_url(hrefs[0]),
         })
     return results

Ejemplo n.º 30

0

Mostrar archivo

Archivo: 2ddl_scraper.py Proyecto: freeworldxbmc/KAOSbox-Repo

 def _get_episode_url(self, show_url, video):
     """Walk the paginated post list of a show and return the episode's URL.

     Starting at *show_url*, each page's ``post-<n>`` divs are scanned in
     order.  Scanning stops once a post is reported too old or there is no
     ``nextpostslink``.  A post is considered only if it contains the TV
     show category marker and *show_url*.  Unless the title match is forced,
     the post link text must match ``SxxEyy`` or the air date; otherwise,
     when title fallback is enabled, the normalized text between
     ``</strong>`` and ``</p>`` must equal the normalized episode title.

     Returns:
         The pathified post URL, or ``None`` (implicitly) when nothing
         matches.
     """
     sxe = r'(\.|_| )S%02dE%02d(\.|_| )' % (int(video.season), int(video.episode))
     force_title = scraper_utils.force_title(video)
     title_fallback = kodi.get_setting('title-fallback') == 'true'
     norm_title = scraper_utils.normalize_title(video.ep_title)
     try:
         airdate_pattern = video.ep_airdate.strftime(r'(\.|_| )%Y(\.|_| )%m(\.|_| )%d(\.|_| )')
     except (AttributeError, ValueError):
         # No usable air date (missing/None, or a date strftime rejects).
         # Narrowed from a bare ``except:`` so unrelated errors surface.
         airdate_pattern = ''

     page_url = [show_url]
     too_old = False
     while page_url and not too_old:
         url = urlparse.urljoin(self.base_url, page_url[0])
         html = self._http_get(url, require_debrid=True, cache_limit=1)
         for post in dom_parser.parse_dom(html, 'div', {'id': r'post-\d+'}):
             if self.__too_old(post):
                 too_old = True
                 break

             if CATEGORIES[VIDEO_TYPES.TVSHOW] not in post or show_url not in post:
                 continue

             match = re.search(r'<a\s+href="([^"]+)[^>]+>(.*?)</a>', post)
             if not match:
                 continue

             post_url, title = match.groups()
             if not force_title:
                 if re.search(sxe, title) or (airdate_pattern and re.search(airdate_pattern, title)):
                     return scraper_utils.pathify_url(post_url)
             elif title_fallback and norm_title:
                 match = re.search('</strong>(.*?)</p>', post)
                 if match and norm_title == scraper_utils.normalize_title(match.group(1)):
                     return scraper_utils.pathify_url(post_url)

         page_url = dom_parser.parse_dom(html, 'a', {'class': 'nextpostslink'}, ret='href')

Ejemplo n.º 31

0

Mostrar archivo

Archivo: pelispedia_scraper.py Proyecto: kevintone/tdbaddon

 def get_sources(self, video):
     """Collect hosters from the buttons inside the player iframe of *video*.

     Fetches the page for ``self.get_url(video)``, follows the iframe in the
     ``repro`` div and walks every link in its ``botones`` div.  Links that
     contain ``self.base_url`` or ``pelispedia.biz`` are fetched (with the
     iframe URL as ``Referer``) and handed to the three link extractors; any
     other link becomes an indirect 720p hoster on its own hostname.

     Returns:
         A list of hoster dicts (possibly empty).
     """
     hosters = []
     source_url = self.get_url(video)
     if not source_url or source_url == FORCE_NO_MATCH:
         return hosters

     url = urlparse.urljoin(self.base_url, source_url)
     page_html = self._http_get(url, cache_limit=.5)
     player = dom_parser.parse_dom(page_html, 'div', {'class': 'repro'})
     if not player:
         return hosters

     iframe_url = dom_parser.parse_dom(player[0], 'iframe', ret='src')
     if not iframe_url:
         return hosters

     iframe_html = self._http_get(iframe_url[0], cache_limit=.5)
     buttons = dom_parser.parse_dom(iframe_html, 'div', {'id': 'botones'})
     if not buttons:
         return hosters

     for media_url in dom_parser.parse_dom(buttons[0], 'a', ret='href'):
         is_internal = self.base_url in media_url or 'pelispedia.biz' in media_url
         if not is_internal:
             host = urlparse.urlparse(media_url).hostname
             hosters.append({
                 'multi-part': False,
                 'host': host,
                 'class': self,
                 'quality': QUALITIES.HD720,
                 'views': None,
                 'rating': None,
                 'url': media_url,
                 'direct': False,
             })
             continue

         headers = {'Referer': iframe_url[0]}
         media_html = self._http_get(media_url, headers=headers, cache_limit=.5)
         hosters += self.__get_page_links(media_html)
         hosters += self.__get_pk_links(media_html)
         hosters += self.__get_gk_links(media_html, url)

     return hosters

Ejemplo n.º 32

0

Mostrar archivo

Archivo: watch5s_scraper.py Proyecto: freeworldxbmc/KAOSbox-Repo

    def search(self, video_type, title, year, season=''):
        """Search the site for *title* and return movie or season matches.

        An ``ml-item`` with an ``mli-eps`` span is an episodic entry: movie
        searches keep entries without it, season searches keep entries with
        it (optionally only those whose title ends in ``Season <season>``).
        Result URLs always point at the entry's ``watch/`` sub-page.  For
        movies the year is read from the trailing ``- YYYY`` of the image
        ``alt`` text.

        Args:
            video_type: ``VIDEO_TYPES.MOVIE`` or ``VIDEO_TYPES.SEASON``.
            title: Title to search for; it is URL-quoted into the query.
            year: Year filter; a falsy value disables the filter.
            season: Season number to keep for season searches; falsy keeps all.

        Returns:
            A list of dicts with the keys ``title``, ``year`` and ``url``.
        """
        query_url = urlparse.urljoin(self.base_url, '/search/?q=') + urllib.quote_plus(title)
        html = self._http_get(query_url, cache_limit=8)

        results = []
        for item in dom_parser.parse_dom(html, 'div', {'class': 'ml-item'}):
            info = dom_parser.parse_dom(item, 'span', {'class': 'mli-info'})
            link = re.search('href="([^"]+)', item, re.DOTALL)
            alt_text = dom_parser.parse_dom(item, 'img', ret='alt')
            is_episodes = dom_parser.parse_dom(item, 'span', {'class': 'mli-eps'})

            movie_hit = video_type == VIDEO_TYPES.MOVIE and not is_episodes
            season_hit = video_type == VIDEO_TYPES.SEASON and is_episodes
            if not (movie_hit or season_hit):
                continue
            if not (info and link):
                continue

            item_url = link.group(1)
            item_title = re.sub('</?h2>', '', info[0])
            item_title = re.sub('\s+\d{4}$', '', item_title)

            if video_type == VIDEO_TYPES.SEASON and season:
                if not re.search('Season\s+%s$' % (season), item_title):
                    continue

            if not item_url.endswith('/'):
                item_url += '/'
            item_url = urlparse.urljoin(item_url, 'watch/')

            item_year = ''
            if video_type == VIDEO_TYPES.MOVIE and alt_text:
                trailing_year = re.search('\s*-\s*(\d{4})$', alt_text[0])
                if trailing_year:
                    item_year = trailing_year.group(1)

            if year and item_year and year != item_year:
                continue

            results.append({
                'title': scraper_utils.cleanse_title(item_title),
                'year': item_year,
                'url': scraper_utils.pathify_url(item_url),
            })

        return results

Ejemplo n.º 33

0

Mostrar archivo

    def search(self, video_type, title, year, season=''):
        """Query ``SEARCH_URL`` for *title* and return the matching films.

        Every ``films-item`` list entry with both a link and a title becomes
        a candidate; its year is the first run of digits in the
        ``films-item-year`` div, or ``''`` when absent.

        Args:
            video_type: Not used by this implementation.
            title: Title to search for; it is URL-quoted into the query.
            year: Year filter; a falsy value disables the filter.
            season: Not used by this implementation.

        Returns:
            A list of dicts with the keys ``title``, ``year`` and ``url``.
        """
        query_url = urlparse.urljoin(self.base_url, SEARCH_URL) % (urllib.quote_plus(title))
        html = self._http_get(query_url, headers=XHR, cache_limit=1)

        results = []
        for film in dom_parser.parse_dom(html, 'li', {'class': 'films-item'}):
            hrefs = dom_parser.parse_dom(film, 'a', ret='href')
            titles = dom_parser.parse_dom(film, 'div', {'class': 'films-item-title'})
            years = dom_parser.parse_dom(film, 'div', {'class': 'films-item-year'})
            if not (hrefs and titles):
                continue

            film_title = re.sub('</?span>', '', titles[0])

            digits = re.search('(\d+)', years[0]) if years else None
            film_year = digits.group(1) if digits else ''

            if year and film_year and year != film_year:
                continue

            results.append({
                'title': scraper_utils.cleanse_title(film_title),
                'year': film_year,
                'url': hrefs[0]
            })

        return results

Ejemplo n.º 34

0

Mostrar archivo

 def _get_episode_url(self, show_url, video):
     """Walk the paginated post list of a show and return the episode's URL.

     Starting at *show_url*, each page's ``<h2>`` link headings are paired
     positionally with its ``post-<n>`` divs.  Scanning stops once a post is
     reported too old or there is no ``nextpostslink``.  A post is
     considered only if it contains the TV show category marker and
     *show_url*.  Unless the title match is forced, the heading text must
     match ``SxxEyy`` or the air date; otherwise, when title fallback is
     enabled, the normalized text between ``</strong>`` and ``</p>`` must
     equal the normalized episode title.

     Returns:
         The pathified heading URL, or ``None`` (implicitly) when nothing
         matches.
     """
     sxe = r'(\.|_| )S%02dE%02d(\.|_| )' % (int(video.season), int(video.episode))
     force_title = scraper_utils.force_title(video)
     title_fallback = kodi.get_setting('title-fallback') == 'true'
     norm_title = scraper_utils.normalize_title(video.ep_title)
     try:
         airdate_pattern = video.ep_airdate.strftime(r'(\.|_| )%Y(\.|_| )%m(\.|_| )%d(\.|_| )')
     except (AttributeError, ValueError):
         # No usable air date (missing/None, or a date strftime rejects).
         # Narrowed from a bare ``except:`` so unrelated errors surface.
         airdate_pattern = ''

     page_url = [show_url]
     too_old = False
     while page_url and not too_old:
         url = urlparse.urljoin(self.base_url, page_url[0])
         html = self._http_get(url, cache_limit=1)
         headings = re.findall(r'<h2>\s*<a\s+href="([^"]+)[^>]+>(.*?)</a>', html)
         posts = dom_parser.parse_dom(html, 'div', {'id': r'post-\d+'})
         # NOTE(review): zip assumes the n-th heading belongs to the n-th
         # post div - confirm against the page markup.
         for (post_url, title), post in zip(headings, posts):
             if self.__too_old(post):
                 too_old = True
                 break

             if CATEGORIES[VIDEO_TYPES.TVSHOW] not in post or show_url not in post:
                 continue

             if not force_title:
                 if re.search(sxe, title) or (airdate_pattern and re.search(airdate_pattern, title)):
                     return scraper_utils.pathify_url(post_url)
             elif title_fallback and norm_title:
                 match = re.search('</strong>(.*?)</p>', post)
                 if match and norm_title == scraper_utils.normalize_title(match.group(1)):
                     return scraper_utils.pathify_url(post_url)

         page_url = dom_parser.parse_dom(html, 'a', {'class': 'nextpostslink'}, ret='href')

Ejemplo n.º 35

0

Mostrar archivo

    def search(self, video_type, title, year):
        """Search the site for *title* within the section for *video_type*.

        Results are read from the ``movies`` or ``series`` div.  Each
        ``figcaption`` supplies a ``title="..."`` (``Name (YYYY)``) and an
        ``href``.  A leading ``'Watch '`` and a trailing ``' Online'`` are
        stripped from the title.

        Args:
            video_type: ``VIDEO_TYPES.MOVIE`` or ``VIDEO_TYPES.TVSHOW``; any
                other value raises ``KeyError``.
            title: Title to search for; it is URL-quoted into the query.
            year: Year filter; a falsy value disables the filter.

        Returns:
            A list of dicts with the keys ``title``, ``url`` and ``year``.
        """
        search_url = urlparse.urljoin(self.base_url,
                                      '/index.php?menu=search&query=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        sections = {VIDEO_TYPES.MOVIE: 'movies', VIDEO_TYPES.TVSHOW: 'series'}

        fragment = dom_parser.parse_dom(html, 'div',
                                        {'id': sections[video_type]})
        if not fragment:
            return results

        prefix, suffix = 'Watch ', ' Online'
        for item in dom_parser.parse_dom(fragment[0], 'figcaption'):
            match = re.search(r'title="([^"]+)[^>]+href="([^"]+)', item)
            if not match:
                continue

            match_title_year, url = match.groups()
            match = re.search(r'(.*?)\s+\(?(\d{4})\)?', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                match_year = ''

            # Fixed: str.replace() removed every occurrence, mangling
            # titles that contain these words elsewhere.  Slice off only
            # the actual prefix/suffix.
            if match_title.startswith(prefix):
                match_title = match_title[len(prefix):]
            if match_title.endswith(suffix):
                match_title = match_title[:-len(suffix)]

            if not year or not match_year or year == match_year:
                result = {
                    'title': match_title,
                    'url': self._pathify_url(url),
                    'year': match_year
                }
                results.append(result)
        return results

Ejemplo n.º 36

0

Mostrar archivo

Archivo: moviewatcher_scraper.py Proyecto: AMOboxTV/AMOBox.LegoBuild

    def search(self, video_type, title, year, season=''):
        """Search the site for *title* and return entries whose title contains it.

        Each ``video_item`` div supplies a link and an image ``alt`` title.
        For TV show searches only links containing ``/tv-series/`` are kept.
        A result is kept when the normalized search title is a substring of
        the normalized result title.

        Args:
            video_type: Search type; ``VIDEO_TYPES.TVSHOW`` enables the
                ``/tv-series/`` link filter.
            title: Title to search for; it is URL-quoted into the query.
            year: Accepted for interface compatibility.  No year is
                extracted from the results, so it never excludes anything.
            season: Not used by this implementation.

        Returns:
            A list of dicts with the keys ``url``, ``title`` and ``year``
            (``year`` is always ``''``).
        """
        results = []
        search_url = urlparse.urljoin(self.base_url, '/search?q=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=1)
        norm_title = scraper_utils.normalize_title(title)

        # No year is parsed from the result markup; the original
        # ``match_year[0]`` branch was unreachable and has been dropped.
        match_year = ''

        for item in dom_parser.parse_dom(html, 'div', {'class': 'video_item'}):
            match_url = dom_parser.parse_dom(item, 'a', ret='href')
            match_title = dom_parser.parse_dom(item, 'img', ret='alt')
            if not (match_url and match_title):
                continue

            match_url = match_url[0]
            match_title = match_title[0]

            # Fixed: this compared the VIDEO_TYPES container with its own
            # member instead of testing the requested video_type, so the
            # filter was never the intended one.
            if video_type == VIDEO_TYPES.TVSHOW and '/tv-series/' not in match_url:
                continue

            if norm_title in scraper_utils.normalize_title(match_title) and (
                    not year or not match_year or year == match_year):
                result = {
                    'url': scraper_utils.pathify_url(match_url),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year
                }
                results.append(result)

        return results

Ejemplo n.º 37

0

Mostrar archivo

    def search(self, video_type, title, year):
        """Query ``/search.php`` for *title* and return every listed entry.

        The first ``<li>`` of the response is skipped.  Each remaining entry
        with a link and a ``<strong>`` label yields a result; the label is
        split into title and year when it ends in a four-digit year
        (optionally parenthesised).

        Args:
            video_type: Not used by this implementation.
            title: Title to search for; it is URL-quoted into the query.
            year: Not used by this implementation (no year filtering).

        Returns:
            A list of dicts with the keys ``title``, ``year`` and ``url``;
            ``self.base_url`` is removed from each URL.
        """
        query = '/search.php?q=%s&limit=20&timestamp=%s' % (urllib.quote_plus(title), time.time())
        html = self._http_get(urlparse.urljoin(self.base_url, query), cache_limit=.25)

        results = []
        items = dom_parser.parse_dom(html, 'li')
        if len(items) < 2:
            return results

        for item in items[1:]:
            hrefs = dom_parser.parse_dom(item, 'a', ret='href')
            labels = dom_parser.parse_dom(item, 'strong')
            if not (hrefs and labels):
                continue

            label = labels[0].replace('<strong>', '').replace('</strong>', '')
            title_year = re.search('(.*?)(?:\s+\(?(\d{4})\)?)', label)
            if title_year:
                found_title, found_year = title_year.groups()
            else:
                found_title, found_year = label, ''

            results.append({
                'title': found_title,
                'year': found_year,
                'url': hrefs[0].replace(self.base_url, '')
            })
        return results

Ejemplo n.º 38

0

Mostrar archivo

Archivo: moviexk_scraper.py Proyecto: AMOboxTV/AMOBox.LegoBuild

    def search(self, video_type, title, year, season=''):
        """Search ``/search/<title>`` and return the matching entries.

        Each ``inner`` div with a ``name`` div link is a candidate.  Movie
        searches skip links containing ``tv-series``.  The link text is
        stripped of tags and of a ``Watch Movie`` marker, then split into
        title and year on a trailing ``(YYYY...)``; when that yields no year
        the anchor inside the ``year`` span is used.

        Args:
            video_type: ``VIDEO_TYPES.MOVIE`` enables the ``tv-series`` skip.
            title: Title to search for; it is URL-quoted into the path.
            year: Year filter; a falsy value disables the filter.
            season: Not used by this implementation.

        Returns:
            A list of dicts with the keys ``title``, ``url`` and ``year``.
        """
        query_url = urlparse.urljoin(self.base_url, '/search/') + urllib.quote_plus(title)
        html = self._http_get(query_url, cache_limit=1)

        results = []
        for fragment in dom_parser.parse_dom(html, 'div', {'class': 'inner'}):
            name = dom_parser.parse_dom(fragment, 'div', {'class': 'name'})
            if not name:
                continue

            link = re.search('href="([^"]+)[^>]+>(.*?)</a>', name[0])
            if not link:
                continue

            found_url, label = link.groups()
            if video_type == VIDEO_TYPES.MOVIE and 'tv-series' in found_url:
                continue

            label = re.sub('</?[^>]*>', '', label)
            label = re.sub('[Ww]atch\s+[Mm]ovie\s*', '', label)
            label = label.replace('&#8217;', "'")

            title_year = re.search('(.*?)\s+\((\d{4})[^)]*\)$', label)
            if title_year:
                found_title, found_year = title_year.groups()
            else:
                found_title, found_year = label, ''

            if not found_year:
                # Fall back to the dedicated year span of the entry.
                year_span = dom_parser.parse_dom(fragment, 'span', {'class': 'year'})
                if year_span:
                    year_text = dom_parser.parse_dom(year_span[0], 'a')
                    if year_text:
                        found_year = year_text[0].strip()

            if year and found_year and year != found_year:
                continue

            results.append({
                'title': scraper_utils.cleanse_title(found_title),
                'url': scraper_utils.pathify_url(found_url),
                'year': found_year,
            })

        return results

Ejemplo n.º 39

0

Mostrar archivo

Archivo: dizifilmhd_scraper.py Proyecto: kevintone/tdbaddon

    def search(self, video_type, title, year, season=''):
        """Search the site for *title* and return the matching entries.

        Each ``item`` div with a link and a ``tt`` title span is a candidate.
        Entries whose ``calidad2`` span text is in ``SEARCH_EXCLUDE`` and
        entries whose title contains ``SEZON`` (case-insensitive) are
        skipped.  Words listed in ``TITLE_STRIP`` are removed from the title.

        Args:
            video_type: Not used by this implementation.
            title: Title to search for; it is URL-quoted into the query.
            year: Year filter; a falsy value disables the filter.
            season: Not used by this implementation.

        Returns:
            A list of dicts with the keys ``url``, ``title`` and ``year``.
        """
        query_url = urlparse.urljoin(self.base_url, '/?s=') + urllib.quote_plus(title)
        html = self._http_get(query_url, cache_limit=8)
        title_strip = [word.decode('utf-8') for word in TITLE_STRIP]

        results = []
        for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
            link = re.search('href="([^"]+)', item)
            title_frag = dom_parser.parse_dom(item, 'span', {'class': 'tt'})
            if not (link and title_frag):
                continue

            item_type = dom_parser.parse_dom(item, 'span', {'class': 'calidad2'})
            if item_type and item_type[0] in SEARCH_EXCLUDE:
                continue

            found_url = link.group(1)
            found_title = title_frag[0]
            if 'SEZON' in found_title.upper():
                continue

            year_frag = dom_parser.parse_dom(item, 'span', {'class': 'year'})
            found_year = year_frag[0] if year_frag else ''

            kept_words = []
            for word in found_title.split():
                if word.upper() not in title_strip:
                    kept_words.append(word)
            found_title = ' '.join(kept_words)

            if year and found_year and year != found_year:
                continue

            results.append({
                'url': scraper_utils.pathify_url(found_url),
                'title': scraper_utils.cleanse_title(found_title),
                'year': found_year,
            })

        return results

Ejemplo n.º 40

0

Mostrar archivo

    def get_sources(self, video):
        """Build the hoster entries listed on the page for *video*.

        The page URL comes from ``self.get_url(video)``; nothing is fetched
        when that is empty or equals ``FORCE_NO_MATCH``.  Every ``ldr-item``
        div that carries a ``data-actuallink`` anchor attribute yields one
        hoster dict, with the view count and a rating (score times ten) when
        the markup provides them.

        Returns:
            A list of dicts with the keys ``multi-part``, ``host``, ``class``,
            ``quality``, ``views``, ``rating``, ``url`` and ``direct``.
        """
        source_url = self.get_url(video)
        hosters = []
        if source_url and source_url != FORCE_NO_MATCH:
            page_url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(page_url, cache_limit=.25)
            for link in dom_parser.parse_dom(html, 'div', {'class': '[^"]*ldr-item[^"]*'}):
                stream_url = dom_parser.parse_dom(link, 'a', ret='data-actuallink')

                views = None
                watched = dom_parser.parse_dom(link, 'div', {'class': 'click-count'})
                if watched:
                    match = re.search(r' (\d+) ', watched[0])
                    if match:
                        views = match.group(1)

                # Fixed: ``rating`` was only assigned when a score div was
                # present, so a link without one raised an unbound-name
                # error or inherited the previous link's rating.
                rating = None
                score = dom_parser.parse_dom(link, 'div', {'class': r'\s*point\s*'})
                if score:
                    try:
                        points = int(score[0])
                    except ValueError:
                        # Non-numeric score markup: treat as "no rating".
                        points = 0
                    if points:
                        rating = points * 10

                if stream_url:
                    stream_url = stream_url[0]
                    host = urlparse.urlparse(stream_url).hostname
                    quality = scraper_utils.get_quality(video, host, QUALITIES.HIGH)
                    hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': views, 'rating': rating, 'url': stream_url, 'direct': False}
                    hosters.append(hoster)

        return hosters

Ejemplo n.º 41

0

Mostrar archivo

Archivo: moviewatcher_scraper.py Proyecto: freeworldxbmc/KAOSbox-Repo

 def get_sources(self, video):
     """Build the hoster entries from the stream table of *video*'s page.

     Each ``stream-table__row`` div supplies a link plus optional ``Views:``
     and ``Size:`` figures.  Links of the form ``/redirect/<payload>`` are
     replaced by the URL-unquoted, base64-decoded payload.  Rows whose final
     URL has no hostname are dropped.  The size figure is multiplied by
     1024 * 1024 and attached as a formatted ``size`` entry when present.

     Returns:
         A list of hoster dicts (possibly empty).
     """
     hosters = []
     source_url = self.get_url(video)
     if not source_url or source_url == FORCE_NO_MATCH:
         return hosters

     page_url = urlparse.urljoin(self.base_url, source_url)
     html = self._http_get(page_url, cache_limit=.25)
     for row in dom_parser.parse_dom(html, 'div', {'class': 'stream-table__row'}):
         hrefs = dom_parser.parse_dom(row, 'a', ret='href')

         views_match = re.search('Views:\s*(?:</span>)?\s*(\d+)', row, re.I)
         views = views_match.group(1) if views_match else None

         size_match = re.search('Size:\s*(?:</span>)?\s*(\d+)', row, re.I)
         size = int(size_match.group(1)) * 1024 * 1024 if size_match else None

         if not hrefs:
             continue

         stream_url = hrefs[0]
         redirect = re.search('/redirect/(.*)', stream_url)
         if redirect:
             stream_url = base64.decodestring(urllib.unquote(redirect.group(1)))

         host = urlparse.urlparse(stream_url).hostname
         if not host:
             continue

         quality = scraper_utils.get_quality(video, host, QUALITIES.HIGH)
         hoster = {
             'multi-part': False,
             'host': host,
             'class': self,
             'quality': quality,
             'views': views,
             'rating': None,
             'url': stream_url,
             'direct': False,
         }
         if size is not None:
             hoster['size'] = scraper_utils.format_size(size, 'B')
         hosters.append(hoster)
     return hosters

Ejemplo n.º 42

0

Mostrar archivo

    def _get_episode_url(self, show_url, video):
        """Locate the URL of one episode on a show page.

        Lookup order (unless ``scraper_utils.force_title(video)`` is truthy):
        an ``-SxxEyy`` href anywhere in the page, then, if the
        ``airdate-fallback`` setting is ``'true'``, an ``el-item`` whose
        ``date`` div equals the air date formatted as DD-MM-YYYY. Finally,
        when titles are forced or ``title-fallback`` is ``'true'``, an
        ``el-item`` whose ``e-name`` div normalizes to the episode title.

        Returns the pathified URL, or ``None`` when nothing matches.
        """
        page_url = urlparse.urljoin(self.base_url, show_url)
        html = self._http_get(page_url, cache_limit=2)
        if not html:
            return

        force_title = scraper_utils.force_title(video)
        episodes = dom_parser.parse_dom(html, 'div', {'class': '\s*el-item\s*'})

        if not force_title:
            sxe_pattern = 'href="([^"]*-[sS]%02d[eE]%02d(?!\d)[^"]*)' % (int(video.season), int(video.episode))
            sxe_match = re.search(sxe_pattern, html)
            if sxe_match:
                return scraper_utils.pathify_url(sxe_match.group(1))

            if kodi.get_setting('airdate-fallback') == 'true' and video.ep_airdate:
                wanted_date = '%02d-%02d-%d' % (video.ep_airdate.day, video.ep_airdate.month, video.ep_airdate.year)
                for episode in episodes:
                    hrefs = dom_parser.parse_dom(episode, 'a', ret='href')
                    dates = dom_parser.parse_dom(episode, 'div', {'class': 'date'})
                    if hrefs and dates and wanted_date == dates[0].strip():
                        return scraper_utils.pathify_url(hrefs[0])

        if (force_title or kodi.get_setting('title-fallback') == 'true') and video.ep_title:
            wanted_title = scraper_utils.normalize_title(video.ep_title)
            for episode in episodes:
                hrefs = dom_parser.parse_dom(episode, 'a', ret='href')
                names = dom_parser.parse_dom(episode, 'div', {'class': 'e-name'})
                if not (hrefs and names):
                    continue
                if wanted_title == scraper_utils.normalize_title(names[0]):
                    return scraper_utils.pathify_url(hrefs[0])

Ejemplo n.º 43

0

Mostrar archivo

Archivo: watchepisodes_scraper.py Proyecto: monicarero/repository.xvbmc

    def _get_episode_url(self, show_url, video):
        """Resolve the page URL for the episode described by ``video``.

        Strategies, tried in order:
          1. (titles not forced) first href in the page containing
             ``-S<season>E<episode>`` (case-insensitive S/E, two-digit padded);
          2. (titles not forced, ``airdate-fallback`` == ``'true'``) an
             ``el-item`` div whose ``date`` div matches DD-MM-YYYY of the
             episode air date;
          3. (titles forced or ``title-fallback`` == ``'true'``) an ``el-item``
             div whose ``e-name`` div has the same normalized title.

        Returns the pathified URL of the first hit; ``None`` otherwise,
        including when the page fetch returns nothing.
        """
        html = self._http_get(urlparse.urljoin(self.base_url, show_url), cache_limit=2)
        if not html:
            return

        force_title = scraper_utils.force_title(video)
        items = dom_parser.parse_dom(html, 'div', {'class': '\s*el-item\s*'})

        if not force_title:
            season_num, episode_num = int(video.season), int(video.episode)
            found = re.search('href="([^"]*-[sS]%02d[eE]%02d(?!\d)[^"]*)' % (season_num, episode_num), html)
            if found:
                return scraper_utils.pathify_url(found.group(1))

            if kodi.get_setting('airdate-fallback') == 'true' and video.ep_airdate:
                airdate = video.ep_airdate
                target = '%02d-%02d-%d' % (airdate.day, airdate.month, airdate.year)
                for entry in items:
                    link = dom_parser.parse_dom(entry, 'a', ret='href')
                    shown = dom_parser.parse_dom(entry, 'div', {'class': 'date'})
                    if link and shown:
                        if target == shown[0].strip():
                            return scraper_utils.pathify_url(link[0])

        if (force_title or kodi.get_setting('title-fallback') == 'true') and video.ep_title:
            target_title = scraper_utils.normalize_title(video.ep_title)
            for entry in items:
                link = dom_parser.parse_dom(entry, 'a', ret='href')
                label = dom_parser.parse_dom(entry, 'div', {'class': 'e-name'})
                if link and label and target_title == scraper_utils.normalize_title(label[0]):
                    return scraper_utils.pathify_url(link[0])

Ejemplo n.º 44

0

Mostrar archivo

 def _get_episode_url(self, show_url, video):
     """Find an episode URL via the site's season listing endpoint.

     The show page's ``dizidetay`` div supplies ``data-id`` and ``data-dizi``;
     these are sent as query parameters to ``SEASON_URL`` with a POST, and
     the matching itself is delegated to ``_default_get_episode_url``.

     Returns:
         The helper's result when it is truthy and does not contain
         ``'javascript:;'``; otherwise ``None``.
     """
     url = urlparse.urljoin(self.base_url, show_url)
     html = self._http_get(url, cache_limit=1)
     data_id = dom_parser.parse_dom(html, 'div', {'id': 'dizidetay'}, ret='data-id')
     data_dizi = dom_parser.parse_dom(html, 'div', {'id': 'dizidetay'}, ret='data-dizi')
     if data_id and data_dizi:
         queries = {
             'sekme': 'bolumler',
             'id': data_id[0],
             'dizi': data_dizi[0]
         }
         season_url = SEASON_URL + '?' + urllib.urlencode(queries)
         episode_pattern = '''href=['"]([^'"]*/%s-sezon-%s-[^'"]*bolum[^'"]*)''' % (
             video.season, video.episode)
         title_pattern = '''href=['"](?P<url>[^'"]+)[^>]*>(?P<title>[^<]+)'''
         airdate_pattern = r'''href=['"]([^"']+)[^>]*>[^<]*</a>\s*</td>\s*<td class="right aligned">{p_day}\.{p_month}\.{year}'''
         # Work on a copy: assigning XHR directly and then adding keys would
         # mutate the shared constant, leaking this request's Referer and
         # Content-Length into every later user of XHR.
         headers = dict(XHR)
         headers['Content-Length'] = 0
         headers['Referer'] = url
         result = self._default_get_episode_url(season_url,
                                                video,
                                                episode_pattern,
                                                title_pattern,
                                                airdate_pattern,
                                                headers=headers,
                                                method='POST')
         # A 'javascript:;' href is a placeholder link, not a real episode page.
         if result and 'javascript:;' not in result:
             return result

Ejemplo n.º 45

0

Mostrar archivo

    def get_sources(self, video):
        """Collect iframe-embedded hosters for ``video``.

        Starting at the video's source page, the ``entry`` div is fetched and
        any "Watch it here: <link>" reference is followed until a page without
        such a link is reached. If a fetch returns nothing, the source URL is
        retried once against ``BASE_URL2``. The iframes inside the final
        entry's ``postTabs_divs`` tabs become the hosters.

        Returns:
            A list of hoster dicts (possibly empty).
        """
        source_url = self.get_url(video)
        hosters = []
        if source_url and source_url != FORCE_NO_MATCH:
            url = urlparse.urljoin(self.base_url, source_url)
            entry = ''
            # Track followed URLs so a page that links back to itself (or a
            # cycle of pages) cannot keep this loop spinning forever.
            visited = set()
            while url not in visited:
                visited.add(url)
                html = self._http_get(url, cache_limit=.5)
                if not html:
                    url = urlparse.urljoin(BASE_URL2, source_url)
                    visited.add(url)
                    html = self._http_get(url, cache_limit=.5)
                entry = dom_parser.parse_dom(html, 'div', {'class': 'entry'})
                if entry:
                    entry = entry[0]
                    match = re.search(r'Watch it here\s*:.*?href="([^"]+)', entry, re.I)
                    if match:
                        url = match.group(1)
                    else:
                        break
                else:
                    entry = ''
                    break

            for tab in dom_parser.parse_dom(entry, 'div', {'class': '''[^'"]*postTabs_divs[^'"]*'''}):
                match = re.search('<iframe[^>]*src="([^"]+)', tab, re.I | re.DOTALL)
                if match:
                    link = match.group(1)
                    host = urlparse.urlparse(link).hostname
                    hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': scraper_utils.get_quality(video, host, QUALITIES.HIGH), 'views': None, 'rating': None, 'url': link, 'direct': False}
                    hosters.append(hoster)

        return hosters

Ejemplo n.º 46

0

Mostrar archivo

    def search(self, video_type, title, year, season=''):
        """Query the site search and return matching entries.

        Results are read from ``item`` divs inside the first ``items``
        container. Each item's ``a.header`` text is split into title and year
        when it looks like ``Title (YYYY)``. An item is kept when ``year`` is
        empty, the item has no year, or the two are equal.

        Returns a list of ``{'url', 'title', 'year'}`` dicts.
        """
        results = []
        search_url = urlparse.urljoin(self.base_url, SEARCH_URL) + urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=8)
        container = dom_parser.parse_dom(html, 'div', {'class': '[^"]*items[^"]*'})
        if not container:
            return results

        for item in dom_parser.parse_dom(container[0], 'div', {'class': 'item'}):
            hrefs = dom_parser.parse_dom(item, 'a', {'class': 'header'}, ret='href')
            labels = dom_parser.parse_dom(item, 'a', {'class': 'header'})
            if not (hrefs and labels):
                continue

            label = labels[0]
            parsed = re.search('(.*?)\s+\((\d{4})\)', label)
            if parsed:
                match_title, match_year = parsed.groups()
            else:
                match_title, match_year = label, ''

            if not year or not match_year or year == match_year:
                results.append({
                    'url': scraper_utils.pathify_url(hrefs[0]),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year
                })

        return results

Ejemplo n.º 47

0

Mostrar archivo

Archivo: ddlvalley_scraper.py Proyecto: monicarero/repository.xvbmc

 def _get_episode_url(self, show_url, video):
     """Walk the paginated post listing looking for the requested episode.

     Each page's ``<h2><a href=...>`` headings are paired positionally with
     its ``post-<n>`` divs. Paging follows the ``nextpostslink`` anchor and
     stops when there is no next page or ``__too_old`` flags a post.

     Returns:
         The pathified post URL of the first match, or ``None``.
     """
     sxe = '.S%02dE%02d.' % (int(video.season), int(video.episode))
     force_title = scraper_utils.force_title(video)
     title_fallback = kodi.get_setting('title-fallback') == 'true'
     norm_title = scraper_utils.normalize_title(video.ep_title)
     try:
         ep_airdate = video.ep_airdate.strftime('.%Y.%m.%d.')
     except Exception:
         # Best effort: a missing or unformattable air date simply disables
         # air-date matching. (Was a bare ``except:``, which also swallowed
         # KeyboardInterrupt/SystemExit.)
         ep_airdate = ''

     page_url = [show_url]
     too_old = False
     while page_url and not too_old:
         url = urlparse.urljoin(self.base_url, page_url[0])
         html = self._http_get(url, require_debrid=True, cache_limit=1)
         headings = re.findall(r'<h2>\s*<a\s+href="([^"]+)[^>]+>(.*?)</a>', html)
         posts = dom_parser.parse_dom(html, 'div', {'id': r'post-\d+'})
         for heading, post in zip(headings, posts):
             if self.__too_old(post):
                 too_old = True
                 break
             if CATEGORIES[VIDEO_TYPES.TVSHOW] in post and show_url in post:
                 url, title = heading
                 if not force_title:
                     if (sxe in title) or (ep_airdate and ep_airdate in title):
                         return scraper_utils.pathify_url(url)
                 else:
                     # NOTE(review): title matching only runs when titles are
                     # forced AND title-fallback is enabled; other scrapers use
                     # ``force_title or title_fallback`` -- confirm intent.
                     if title_fallback and norm_title:
                         match = re.search('<strong>(.*?)</strong>', post)
                         if match and norm_title == scraper_utils.normalize_title(match.group(1)):
                             return scraper_utils.pathify_url(url)

         page_url = dom_parser.parse_dom(html, 'a', {'class': 'nextpostslink'}, ret='href')
Ejemplo n.º 48

0

Mostrar archivo

Archivo: sezonlukdizi_scraper.py Proyecto: c0ns0le/YCBuilds

    def search(self, video_type, title, year, season=''):
        """Run a site search for ``title`` and return the matching entries.

        Items come from ``item`` divs inside the first ``items`` container;
        the ``a.header`` text is split into title and year when it has the
        form ``Title (YYYY)``. The title is returned as found on the page.
        An entry is accepted when either year is empty or both are equal.

        Returns a list of ``{'url', 'title', 'year'}`` dicts.
        """
        results = []
        search_url = urlparse.urljoin(self.base_url, SEARCH_URL) + urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=8)
        listing = dom_parser.parse_dom(html, 'div', {'class': '[^"]*items[^"]*'})
        if not listing:
            return results

        for item in dom_parser.parse_dom(listing[0], 'div', {'class': 'item'}):
            hrefs = dom_parser.parse_dom(item, 'a', {'class': 'header'}, ret='href')
            headers = dom_parser.parse_dom(item, 'a', {'class': 'header'})
            if not hrefs or not headers:
                continue

            header_text = headers[0]
            title_year = re.search('(.*?)\s+\((\d{4})\)', header_text)
            if title_year:
                match_title, match_year = title_year.groups()
            else:
                match_title, match_year = header_text, ''

            if not year or not match_year or year == match_year:
                results.append({'url': scraper_utils.pathify_url(hrefs[0]), 'title': match_title, 'year': match_year})

        return results

Ejemplo n.º 49

0

Mostrar archivo

    def search(self, video_type, title, year, season=''):
        """Search via ``/?s=<title>`` and return the matching entries.

        For each ``item`` div with an href and a ``span.tt`` label, the label
        is split into title and year (``Title 2010`` or ``Title (2010)``).
        A ``span.year`` element, when present, overrides the parsed year.
        Entries are kept when either year is empty or both are equal.

        Returns a list of ``{'url', 'title', 'year'}`` dicts.
        """
        results = []
        search_url = urlparse.urljoin(self.base_url, '/?s=') + urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=1)
        for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
            href = re.search('href="([^"]+)', item)
            if not href:
                continue
            url = href.group(1)

            labels = dom_parser.parse_dom(item, 'span', {'class': 'tt'})
            if not labels:
                continue

            parsed = re.search('(.*?)\s+\(?(\d{4})\)?', labels[0])
            if parsed:
                match_title, match_year = parsed.groups()
            else:
                match_title, match_year = labels[0], ''

            # An explicit year span takes precedence over the parsed year.
            year_frag = dom_parser.parse_dom(item, 'span', {'class': 'year'})
            if year_frag:
                match_year = year_frag[0]

            if not year or not match_year or year == match_year:
                results.append({
                    'url': scraper_utils.pathify_url(url),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year
                })

        return results

Ejemplo n.º 50

0

Mostrar archivo

Archivo: putmv_scraper.py Proyecto: kevintone/tdbaddon

 def search(self, video_type, title, year, season=''):
     """Search the site for movies or seasons matching ``title``.

     Entries whose name ends in ``" - S<n>"`` are treated as seasons. Movie
     searches keep only non-season entries; season searches keep only season
     entries (and, when ``season`` is given, only that season number).
     For movies the year comes from a ``p.info`` fragment or, failing that,
     from four trailing digits in the URL.

     Returns:
         A list of ``{'url', 'title', 'year'}`` dicts.
     """
     search_url = urlparse.urljoin(self.base_url, '/search/%s.html' % urllib.quote_plus(title))
     html = self._http_get(search_url, cache_limit=.25)
     results = []
     fragment = dom_parser.parse_dom(html, 'div', {'class': 'list-movie'})
     if fragment:
         for item in dom_parser.parse_dom(fragment[0], 'div', {'class': 'movie'}):
             match = re.search('class="movie-name".*?href="([^"]+)[^>]+>([^<]+)', item)
             if match:
                 url, match_title = match.groups()
                 is_season = re.search(r'\s+-\s+[Ss](\d+)$', match_title)
                 # The original tested ``is_season and VIDEO_TYPES.SEASON``,
                 # i.e. the truthiness of the constant, so season entries
                 # passed for every video type. Compare against video_type.
                 if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                     match_year = ''
                     if video_type == VIDEO_TYPES.MOVIE:
                         for info_frag in dom_parser.parse_dom(item, 'p', {'class': 'info'}):
                             match = re.search(r'(\d{4})', info_frag)
                             if match:
                                 match_year = match.group(1)
                                 break

                         if not match_year:
                             match = re.search(r'(\d{4})$', url)
                             if match:
                                 match_year = match.group(1)
                     else:
                         # Only reachable for season entries, so is_season is a match.
                         if season and int(is_season.group(1)) != int(season):
                             continue

                     if (not year or not match_year or year == match_year):
                         result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                         results.append(result)

     return results

Ejemplo n.º 51

0

Mostrar archivo

Archivo: pctf_scraper.py Proyecto: ayadmustafa/tknorris-beta-repo

    def search(self, video_type, title, year):
        """Search via ``/?query=<title>`` and return the matching entries.

        Each ``movie-info`` div must provide a ``movie-title`` span and an
        href; the ``movie-year`` span is optional. Title and year text go
        through ``__strip_link``. Entries are kept when either year is empty
        or both are equal. The base URL is removed from the returned URL.

        Returns a list of ``{'title', 'year', 'url'}`` dicts.
        """
        search_url = urlparse.urljoin(self.base_url, "/?query=") + urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=0.25)
        results = []
        for item in dom_parser.parse_dom(html, "div", {"class": "movie-info"}):
            title_frag = dom_parser.parse_dom(item, "span", {"class": "movie-title"})
            year_frag = dom_parser.parse_dom(item, "span", {"class": "movie-year"})
            if not title_frag:
                continue

            match_title = self.__strip_link(title_frag[0])
            match_year = self.__strip_link(year_frag[0]) if year_frag else ""

            href = re.search('href="([^"]+)', item)
            if not href:
                continue
            url = href.group(1)

            if not year or not match_year or year == match_year:
                results.append({"title": match_title, "year": match_year, "url": url.replace(self.base_url, "")})

        return results

Ejemplo n.º 52

0

Mostrar archivo

Archivo: moviestorm_scraper.py Proyecto: freeworldxbmc/KAOSbox-Repo

    def search(self, video_type, title, year, season=''):
        """Return entries whose normalized title contains the searched title.

        TV shows are taken from the ``/series/all/`` index (anchors with class
        ``underilne``). Anything else goes through the search form; if the
        response asks to "search again in N seconds", the method sleeps for
        N seconds (capped at ``self.timeout``) and retries once uncached.

        Returns a list of ``{'url', 'title', 'year'}`` dicts; ``year`` is
        always the empty string.
        """
        if video_type == VIDEO_TYPES.TVSHOW:
            index_url = urlparse.urljoin(self.base_url, '/series/all/')
            html = self._http_get(index_url, cache_limit=8)

            hrefs = dom_parser.parse_dom(html, 'a', {'class': 'underilne'}, 'href')
            names = dom_parser.parse_dom(html, 'a', {'class': 'underilne'})
            candidates = zip(hrefs, names)
        else:
            form_url = urlparse.urljoin(self.base_url, '/search?=%s' % urllib.quote_plus(title))
            form_data = {'q': title, 'go': 'Search'}
            html = self._http_get(form_url, data=form_data, cache_limit=8)
            throttled = re.search('you can search again in (\d+) seconds', html, re.I)
            if throttled:
                wait = int(throttled.group(1))
                if wait > self.timeout:
                    wait = self.timeout
                time.sleep(wait)
                html = self._http_get(form_url, data=form_data, cache_limit=0)

            candidates = re.findall('class="movie_box.*?href="([^"]+).*?<h1>([^<]+)', html, re.DOTALL)

        results = []
        norm_title = scraper_utils.normalize_title(title)
        for link, match_title in candidates:
            if norm_title not in scraper_utils.normalize_title(match_title):
                continue
            results.append({'url': scraper_utils.pathify_url(link), 'title': scraper_utils.cleanse_title(match_title), 'year': ''})

        return results

Ejemplo n.º 53

0

Mostrar archivo

 def search(self, video_type, title, year):
     """Search for ``title``.

     TV shows: every URL in ``TVSHOW_URLS`` is fetched and passed to
     ``__parse_directory``; rows whose normalized title contains the
     normalized search title are returned with an empty year.

     Otherwise: ``/?s=<title>`` is queried and each ``entry-body`` article
     filed under ``/category/movies/`` is returned, with title and year taken
     from the text after the first ``</a>`` in its ``post-content`` div.

     Returns a list of ``{'url', 'title', 'year'}`` dicts.
     """
     results = []
     norm_title = self._normalize_title(title)
     if video_type == VIDEO_TYPES.TVSHOW:
         for server_url in TVSHOW_URLS:
             listing = self._http_get(server_url, cache_limit=48)
             for row in self.__parse_directory(listing):
                 # Directory rows carry no year information.
                 match_year = ''
                 if norm_title not in self._normalize_title(row['title']):
                     continue
                 if not year or not match_year or year == match_year:
                     results.append({'url': urlparse.urljoin(server_url, row['link']), 'title': row['title'], 'year': match_year})
         return results

     search_url = urlparse.urljoin(self.base_url, '/?s=') + urllib.quote_plus(title)
     html = self._http_get(search_url, cache_limit=1)
     for article in dom_parser.parse_dom(html, 'article', {'class': 'entry-body'}):
         link = dom_parser.parse_dom(article, 'a', {'class': 'more-link'}, 'href')
         content = dom_parser.parse_dom(article, 'div', {'class': 'post-content'})
         heading = re.search('</a>\s*([^<]+)', content[0]) if content else ''
         info = dom_parser.parse_dom(article, 'div', {'class': 'post-info'})
         is_movie = re.search('/category/movies/', info[0]) if info else False
         if not (heading and link and is_movie):
             continue

         match_title_year = heading.group(1)
         parsed = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
         if parsed:
             match_title, match_year = parsed.groups()
         else:
             match_title, match_year = match_title_year, ''

         if not year or not match_year or year == match_year:
             results.append({'url': self._pathify_url(link[0]), 'title': match_title, 'year': match_year})

     return results

Ejemplo n.º 54

0

Mostrar archivo

    def search(self, video_type, title, year):
        """Search via ``/?s=<title> <year>`` and return the matching entries.

        For every ``item`` div the first href and the following ``alt`` text
        are extracted. The alt text is split into title and year when it ends
        in a four-digit year (optionally parenthesized); otherwise the whole
        text is the title and the year is read from a ``span.year`` element
        if there is one. Entries are kept when either year is empty or both
        are equal.

        Returns a list of ``{'url', 'title', 'year'}`` dicts.
        """
        query = urllib.quote_plus('%s %s' % (title, year))
        search_url = urlparse.urljoin(self.base_url, '/?s=') + query
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
            found = re.search('href="([^"]+).*?alt="([^"]+)', item, re.DOTALL)
            if not found:
                continue

            url, match_title_year = found.groups()
            parsed = re.search('(.*?)(?:\s+\(?(\d{4})\)?)', match_title_year)
            if parsed:
                match_title, match_year = parsed.groups()
            else:
                match_title = match_title_year
                year_fragment = dom_parser.parse_dom(item, 'span', {'class': 'year'})
                match_year = year_fragment[0] if year_fragment else ''

            if not year or not match_year or year == match_year:
                results.append({
                    'url': scraper_utils.pathify_url(url),
                    'title': match_title,
                    'year': match_year
                })
        return results