Python cleanse_title Examples, dsrd_lib.scraper_utils.cleanse_title Python Examples

Example #1

0

Show file

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the site for a TV show or a movie.

        For TV shows the show's category page is fetched and a single result
        is returned when that page contains at least one post. For movies the
        site search is queried and every post heading is parsed and compared
        against the requested title and year.

        :param video_type: one of the VIDEO_TYPES constants
        :param title: title to look for
        :param year: release year to match; a falsy value accepts any year
        :param season: unused
        :return: list of dicts with 'url', 'title' and 'year' keys
        """
        results = []
        post_id = re.compile('post-\d+')
        if video_type == VIDEO_TYPES.TVSHOW and title:
            # The format string previously had no placeholder, so the "%"
            # operator raised TypeError on every show search.
            # NOTE(review): confirm the exact URL layout the site expects.
            test_url = '/category/tv-shows/%s' % (scraper_utils.to_slug(title))
            test_url = scraper_utils.urljoin(self.base_url, test_url)
            html = self._http_get(test_url,
                                  require_debrid=True,
                                  cache_limit=24)
            posts = dom_parser2.parse_dom(html, 'div', {'id': post_id})
            if posts:
                result = {
                    'url': scraper_utils.pathify_url(test_url),
                    'title': scraper_utils.cleanse_title(title),
                    'year': ''
                }
                results.append(result)
        elif video_type == VIDEO_TYPES.MOVIE:
            # The previous pattern was a character class containing the
            # reversed range "7-1", which makes re.sub raise re.error.
            # Strip punctuation from the query instead.
            # NOTE(review): confirm which characters the site search accepts.
            search_title = re.sub('[^A-Za-z0-9. ]', '', title.lower())
            html = self._http_get(self.base_url,
                                  params={'s': search_title},
                                  require_debrid=True,
                                  cache_limit=1)
            norm_title = scraper_utils.normalize_title(title)
            for _attrs, post in dom_parser2.parse_dom(html, 'div',
                                                      {'id': post_id}):
                match = re.search(
                    '<h\d+[^>]*>\s*<a\s+href="([^"]+)[^>]*>(.*?)</a>', post)
                if not match:
                    continue

                post_url, post_title = match.groups()
                if '/tv-show/' in post or self.__too_old(post):
                    continue

                # Drop any markup nested inside the heading link.
                post_title = re.sub('<[^>]*>', '', post_title)
                meta = scraper_utils.parse_movie_link(post_title)
                full_title = '%s [%s] (%sp)' % (
                    meta['title'], meta['extra'], meta['height'])
                match_year = meta['year']

                match_norm_title = scraper_utils.normalize_title(
                    meta['title'])
                # Accept either title being a substring of the other.
                title_ok = (match_norm_title in norm_title
                            or norm_title in match_norm_title)
                year_ok = not year or not match_year or year == match_year
                if title_ok and year_ok:
                    result = {
                        'url': scraper_utils.pathify_url(post_url),
                        'title': scraper_utils.cleanse_title(full_title),
                        'year': match_year
                    }
                    results.append(result)

        return results

Example #2

0

Show file

File: 2ddl_scraper.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the site and return matching shows or movies.

        TV show results are taken from the "TAGS" link of each post and are
        de-duplicated by URL. Movie results pair each <h2> heading link with
        its post and are filtered by normalized title and year.

        :param video_type: one of the VIDEO_TYPES constants
        :param title: title to look for
        :param year: release year to match; a falsy value accepts any year
        :param season: unused
        :return: list of dicts with 'url', 'title' and 'year' keys
        """
        results = []
        # Build an absolute URL; previously only the site-relative path was
        # passed to _http_get.
        search_url = scraper_utils.urljoin(self.base_url, '/search/')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, require_debrid=False, cache_limit=1)
        post_id = re.compile('post-\d+')
        if video_type == VIDEO_TYPES.TVSHOW:
            seen_urls = set()
            for _attr, post in dom_parser2.parse_dom(html, 'div',
                                                     {'id': post_id}):
                if CATEGORIES[video_type] not in post:
                    continue
                match = re.search(
                    '<span>\s*TAGS:\s*</span>\s*<a\s+href="([^"]+)[^>]+>([^<]+)',
                    post, re.I)
                if not match:
                    continue

                show_url, match_title = match.groups()
                # Several posts can link to the same show; report it once.
                if show_url in seen_urls:
                    continue
                seen_urls.add(show_url)
                results.append({
                    'url': scraper_utils.pathify_url(show_url),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': ''
                })
        elif video_type == VIDEO_TYPES.MOVIE:
            norm_title = scraper_utils.normalize_title(title)
            headings = re.findall('<h2>\s*<a\s+href="([^"]+)[^>]+>(.*?)</a>',
                                  html)
            posts = [
                post.content for post in dom_parser2.parse_dom(
                    html, 'div', {'id': post_id})
            ]
            # Headings and posts are paired purely by position in the page.
            for heading, post in zip(headings, posts):
                if CATEGORIES[video_type] not in post or self.__too_old(post):
                    continue
                post_url, post_title = heading
                meta = scraper_utils.parse_movie_link(post_title)
                full_title = '%s [%s] (%sp)' % (meta['title'], meta['extra'],
                                                meta['height'])
                match_year = meta['year']

                match_norm_title = scraper_utils.normalize_title(meta['title'])
                title_ok = (match_norm_title in norm_title
                            or norm_title in match_norm_title)
                year_ok = not year or not match_year or year == match_year
                if title_ok and year_ok:
                    results.append({
                        'url': scraper_utils.pathify_url(post_url),
                        'title': scraper_utils.cleanse_title(full_title),
                        'year': match_year
                    })

        return results

Example #3

0

Show file

File: dizibox_scraper.py Project: Lhse44/repository.deallen

 def __get_ok(self, embed, flashvars):
     """Resolve the stream behind an embed via its metadata URL.

     Reads the 'metadataUrl' value out of the first flashvars element,
     fetches it with the embed's data URL as the Referer and returns a
     one-element hoster list when the JSON response carries a movie URL;
     otherwise returns an empty list.
     """
     flash_value = flashvars[0].attrs['value']
     meta_match = re.search('metadataUrl=([^"]+)', flash_value)
     if not meta_match:
         return []

     referer = scraper_utils.cleanse_title(urllib.unquote(embed[0].attrs['data']))
     ok_url = scraper_utils.cleanse_title(urllib.unquote(meta_match.group(1)))
     html = self._http_get(ok_url, data='ok', headers={'Referer': referer}, cache_limit=.25)
     js_data = scraper_utils.parse_json(html, ok_url)
     stream_url = js_data.get('movie', {}).get('url')
     if stream_url is None:
         return []

     return [{
         'multi-part': False,
         'host': urlparse.urlparse(stream_url).hostname,
         'class': self,
         'quality': QUALITIES.HD720,
         'views': None,
         'rating': None,
         'url': stream_url,
         'direct': False,
         'subs': 'Turkish Subtitles'
     }]

Example #4

0

Show file

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Query the search-box suggestion endpoint for `title`.

        Each suggestion with a non-empty 'Value' becomes a result unless its
        year conflicts with the requested `year`. The year comes from the
        item's 'ReleaseYear' field, falling back to the year parsed out of
        the suggestion text.

        :return: list of dicts with 'title', 'year' and 'url' keys
        """
        suggestion_url = scraper_utils.urljoin(self.base_url,
                                               '/search/searchBoxSuggestion')
        query = {'top': 8, 'query': title}
        html = self._http_get(suggestion_url, params=query, cache_limit=8)

        found = []
        for entry in scraper_utils.parse_json(html, suggestion_url):
            entity_name = entry.get('Value', '')
            if not entity_name:
                continue

            match_title, title_year = scraper_utils.extra_year(entity_name)
            # Prefer the explicit release year over the one in the text.
            match_year = str(entry.get('ReleaseYear', '')) or title_year

            detail_query = urllib.urlencode({
                'entityName': entity_name,
                'ignoreMediaLinkError': 'false'
            })
            match_url = '/ontology/EntityDetails?' + detail_query

            if year and match_year and year != match_year:
                continue
            found.append({
                'title': scraper_utils.cleanse_title(match_title),
                'year': match_year,
                'url': scraper_utils.pathify_url(match_url)
            })

        return found

Example #5

0

Show file

 def search(self, video_type, title, year, season=''):
     """Search the site and return entries from the matching section.

     The result page is split into a 'movies' and a 'series' block; only
     the block for `video_type` is scanned. Each entry's title attribute
     is split into title and year, and a leading "Watch " / trailing
     " Online" is stripped from the title.

     :param video_type: VIDEO_TYPES.MOVIE or VIDEO_TYPES.TVSHOW
     :param title: title to look for
     :param year: release year to match; a falsy value accepts any year
     :param season: unused
     :return: list of dicts with 'title', 'year' and 'url' keys
     """
     search_url = urlparse.urljoin(self.base_url, '/movie/search/')
     search_url += urllib.quote_plus(title)
     html = self._http_get(search_url, cache_limit=.25)
     results = []
     sections = {VIDEO_TYPES.MOVIE: 'movies', VIDEO_TYPES.TVSHOW: 'series'}

     fragment = dom_parser.parse_dom(html, 'div', {'id': sections[video_type]})
     if not fragment:
         return results

     for item in dom_parser.parse_dom(fragment[0], 'figcaption'):
         match = re.search('title="([^"]+)[^>]+href="([^"]+)', item)
         if not match:
             continue

         match_title_year, url = match.groups()
         match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
         if match:
             match_title, match_year = match.groups()
         else:
             # No year in the title text. The old code called .group() on
             # the title string here, which always raised AttributeError.
             match_title, match_year = match_title_year, ''

         # Strip only the leading/trailing decoration, not every occurrence.
         if match_title.startswith('Watch '):
             match_title = match_title[len('Watch '):]
         if match_title.endswith(' Online'):
             match_title = match_title[:-len(' Online')]

         if not year or not match_year or year == match_year:
             # 'year' and 'url' values were previously swapped.
             result = {
                 'title': scraper_utils.cleanse_title(match_title),
                 'year': match_year,
                 'url': scraper_utils.pathify_url(url)
             }
             results.append(result)
     return results

Example #6

0

Show file

    def search(self, video_type, title, year, season=''):
        """Run the site's advanced search and collect title/year/url hits.

        The `is_series` query value is 1 for movies and 2 for anything
        else; `year` is sent as both the lower and upper year bound.
        Links that point at a specific episode are skipped.

        :return: list of dicts with 'url', 'title' and 'year' keys
        """
        is_series = 1 if video_type == VIDEO_TYPES.MOVIE else 2
        url_template = urlparse.urljoin(
            self.base_url,
            '/advanced-search/?q[title]=%s&q[is_series]=%s&q[year_from]=%s&q[year_to]=%s'
        )
        search_url = url_template % (urllib.quote_plus(title), is_series,
                                     year, year)

        html = self._http_get(search_url, cache_limit=.25)
        if re.search('Nothing was found', html):
            return []

        hits = []
        entry_pattern = 'class="name">\s*<a\s+title="([^"]+)\s+\((\d{4})\)"\s+href="([^"]+)'
        for entry in re.finditer(entry_pattern, html):
            found_title, found_year, url = entry.groups('')
            if re.search('/season-\d+/episode-\d+', url):
                continue  # exclude episodes
            hits.append({
                'url': scraper_utils.pathify_url(url),
                'title': scraper_utils.cleanse_title(found_title),
                'year': found_year
            })
        return hits

Example #7

0

Show file

File: hevcbluray_scraper.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the site and return the cover links that match `year`.

        Entries whose title looks like an episode (SxxEyy) are ignored.
        When the title text carries no year, a '-YYYY-' segment in the
        link URL is used instead.

        :return: list of dicts with 'title', 'year' and 'url' keys
        """
        html = self._http_get(self.base_url,
                              params={'s': title},
                              cache_limit=8)
        found = []
        for _attrs, cover in dom_parser2.parse_dom(html, 'div',
                                                   {'class': 'cover'}):
            anchors = dom_parser2.parse_dom(cover, 'a', req=['href', 'title'])
            if not anchors:
                continue

            anchor_attrs = anchors[0].attrs
            match_url = anchor_attrs['href']
            match_title_year = anchor_attrs['title']
            if re.search('S\d+E\d+', match_title_year, re.I):
                continue

            match_title, match_year = scraper_utils.extra_year(
                match_title_year)
            if not match_year:
                url_year = re.search('-(\d{4})-', match_url)
                if url_year:
                    match_year = url_year.group(1)

            # Skip only when both years are known and they disagree.
            if year and match_year and year != match_year:
                continue
            found.append({
                'title': scraper_utils.cleanse_title(match_title),
                'year': match_year,
                'url': scraper_utils.pathify_url(match_url)
            })

        return found

Example #8

0

Show file

    def search(self, video_type, title, year, season=''):
        """Search the site for movies or for a specific season.

        Thumbnails whose title ends in "Season N" are treated as seasons;
        all others as movies. Only thumbnails of the requested kind are
        returned, and for seasons only the requested `season` number (when
        given). Movie years are read from the 'status-year' element.

        :param video_type: VIDEO_TYPES.MOVIE or VIDEO_TYPES.SEASON
        :param title: title to look for
        :param year: release year to match; a falsy value accepts any year
        :param season: season number to keep; a falsy value keeps all
        :return: list of dicts with 'url', 'title' and 'year' keys
        """
        search_url = urlparse.urljoin(self.base_url, '/search/%s.html')
        search_url = search_url % (urllib.quote_plus(title))
        html = self._http_get(search_url, cache_limit=8)
        results = []
        for thumb in dom_parser.parse_dom(html, 'div', {'class': 'thumb'}):
            match_title = dom_parser.parse_dom(thumb,
                                               'a', {'class': 'clip-link'},
                                               ret='title')
            url = dom_parser.parse_dom(thumb,
                                       'a', {'class': 'clip-link'},
                                       ret='href')
            if not (match_title and url):
                continue

            match_title, url = match_title[0], url[0]
            is_season = re.search('Season\s+(\d+)$', match_title, re.I)
            wanted = (not is_season and video_type == VIDEO_TYPES.MOVIE) or (
                is_season and video_type == VIDEO_TYPES.SEASON)
            if not wanted:
                continue

            match_year = ''
            if video_type == VIDEO_TYPES.MOVIE:
                year_fragment = dom_parser.parse_dom(
                    thumb, 'div', {'class': '[^"]*status-year[^"]*'})
                # Keep '' when no year element exists; previously the empty
                # list returned by parse_dom leaked into the result's 'year'.
                if year_fragment:
                    match_year = year_fragment[0]
            elif season and int(is_season.group(1)) != int(season):
                continue

            if not year or not match_year or year == match_year:
                result = {
                    'url': scraper_utils.pathify_url(url),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year
                }
                results.append(result)
        return results

Example #9

0

Show file

File: movietube_scraper.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Submit the site's search form and return every listed film.

        Results are read from the 'short-film' blocks inside the
        'dle-content' container; no year is extracted.

        :return: list of dicts with 'url', 'title' and 'year' ('') keys
        """
        search_url = scraper_utils.urljoin(self.base_url, '/index.php')
        form = {'subaction': 'search', 'story': title, 'do': 'search'}
        html = self._http_get(search_url,
                              params={'do': 'search'},
                              data=form,
                              headers={'Referer': search_url},
                              cache_limit=1)

        found = []
        container = dom_parser2.parse_dom(html, 'div', {'id': 'dle-content'})
        if container:
            for _attrs, block in dom_parser2.parse_dom(
                    container[0].content, 'div', {'class': 'short-film'}):
                heading = re.search(
                    '<h5><a\s+href="([^"]+)[^>]+title="([^"]+)', block)
                if heading:
                    url, match_title = heading.groups('')
                    found.append({
                        'url': scraper_utils.pathify_url(url),
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': ''
                    })

        return found

Example #10

0

Show file

File: merb.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):
        """Run the site's advanced search and parse the result boxes.

        For TV shows and episodes '/?tv' is inserted after the base URL.
        Each 'list_box_title' element yields a link and a "Title (Year)"
        string, which is split when a four-digit year is present.

        :return: list of dicts with 'url', 'title' and 'year' keys
        """
        url_parts = [self.base_url]
        if video_type in [VIDEO_TYPES.TVSHOW, VIDEO_TYPES.EPISODE]:
            url_parts.append('/?tv')
        url_parts.append('/index.php?advanced_search=')
        url_parts.append(urllib.quote_plus(title))
        url_parts.append('&year=' + urllib.quote_plus(str(year)))
        url_parts.append('&advanced_search=Search')
        search_url = ''.join(url_parts)

        html = self._http_get(search_url, cache_limit=.25)
        found = []
        for box in dom_parser.parse_dom(html, 'div',
                                        {'class': 'list_box_title'}):
            link = re.search('href="([^"]+)"\s+title="(?:Watch )?([^"]+)',
                             box)
            if not link:
                continue

            url, match_title_year = link.groups()
            parts = re.search('(.*?)(?:\s+\(?\s*(\d{4})\s*\)?)',
                              match_title_year)
            if parts:
                match_title, match_year = parts.groups()
            else:
                match_title, match_year = match_title_year, ''

            if year and match_year and year != match_year:
                continue
            found.append({
                'url': scraper_utils.pathify_url(url),
                'title': scraper_utils.cleanse_title(match_title),
                'year': match_year
            })
        return found

Example #11

0

Show file

File: rmz_scraper.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the site and return the titles of the requested kind.

        Only 'list' blocks that contain a 'lists_titles' element are
        scanned. A title is treated as a TV show when it carries an
        open-ended year such as "(2015-)"; shows are dropped from movie
        searches and non-shows from TV show searches.

        :param video_type: one of the VIDEO_TYPES constants
        :param title: title to look for
        :param year: release year to match; a falsy value accepts any year
        :param season: unused
        :return: list of dicts with 'url', 'title' and 'year' keys
        """
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search/')
        search_url = scraper_utils.urljoin(search_url,
                                           urllib.quote_plus(title))
        html = self._http_get(search_url, require_debrid=True, cache_limit=8)
        for _attrs, fragment in dom_parser2.parse_dom(html, 'div',
                                                      {'class': 'list'}):
            if not dom_parser2.parse_dom(fragment, 'div',
                                         {'class': 'lists_titles'}):
                continue
            for attrs, match_title_year in dom_parser2.parse_dom(
                    fragment, 'a', {'class': 'title'}, req='href'):
                match_url = attrs['href']
                # Remove any markup nested inside the link text.
                match_title_year = re.sub('</?[^>]*>', '', match_title_year)
                # The old pattern '\(d{4|-\)' was malformed: it matched the
                # literal "(d{4" or any "-)" rather than "(YYYY-)".
                # NOTE(review): confirm shows are always listed as "(YYYY-)".
                is_show = re.search(r'\(\d{4}-\)', match_title_year)
                if (is_show and video_type == VIDEO_TYPES.MOVIE) or (
                        not is_show and video_type == VIDEO_TYPES.TVSHOW):
                    continue

                match_title, match_year = scraper_utils.extra_year(
                    match_title_year)
                if not year or not match_year or year == match_year:
                    result = {
                        'url': scraper_utils.pathify_url(match_url),
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year
                    }
                    results.append(result)

        return results

Example #12

0

Show file

File: onlinedizi_scraper.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Fetch the '/search/' listing and keep entries matching `title`.

        The page is requested without a query; filtering is done locally
        by checking that the normalized `title` is contained in each
        entry's normalized heading. Each URL is considered only once.

        :return: list of dicts with 'url', 'title' and 'year' ('') keys
        """
        listing_url = scraper_utils.urljoin(self.base_url, '/search/')
        html = self._http_get(listing_url, cache_limit=48)
        wanted = scraper_utils.normalize_title(title)

        found = []
        visited = set()
        for _attrs, post in dom_parser2.parse_dom(html, 'div',
                                                  {'class': 'category-post'}):
            links = dom_parser2.parse_dom(post, 'a', req='href')
            headings = dom_parser2.parse_dom(post, 'h3')
            if not (links and headings):
                continue

            match_url = scraper_utils.pathify_url(links[0].attrs['href'])
            match_title = headings[0].content
            if match_url in visited:
                continue
            visited.add(match_url)

            if wanted not in scraper_utils.normalize_title(match_title):
                continue
            found.append({
                'url': match_url,
                'title': scraper_utils.cleanse_title(match_title),
                'year': ''
            })

        return found

Example #13

0

Show file

File: filmstreaming_scraper.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the site and return the 'item' entries that match `year`.

        The link and the image alt text of each item provide URL and
        "title year" text. If the text holds no year, the item's
        'year' span is used when present.

        :return: list of dicts with 'url', 'title' and 'year' keys
        """
        html = self._http_get(self.base_url,
                              params={'s': title},
                              cache_limit=1)
        found = []
        for entry in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
            link = re.search('href="([^"]+).*?alt="([^"]+)', entry, re.DOTALL)
            if not link:
                continue

            url, match_title_year = link.groups()
            match_title, match_year = scraper_utils.extra_year(
                match_title_year)
            if not match_year:
                year_spans = dom_parser.parse_dom(entry, 'span',
                                                  {'class': 'year'})
                match_year = year_spans[0] if year_spans else ''

            if year and match_year and year != match_year:
                continue
            found.append({
                'url': scraper_utils.pathify_url(url),
                'title': scraper_utils.cleanse_title(match_title),
                'year': match_year
            })
        return found

Example #14

0

Show file

File: quikr_scraper.py Project: Lhse44/repository.deallen

 def search(self, video_type, title, year, season=''):  # @UnusedVariable
     """Search the site and return the 'movie-details' articles that match.

     The query is `title` reduced to letters, digits, dots and spaces.
     An article is kept when the normalized `title` is contained in its
     normalized heading and the years do not conflict.

     :return: list of dicts with 'url', 'title' and 'year' keys
     """
     query = re.sub('[^A-Za-z0-9. ]', '', title)
     url = scraper_utils.urljoin(self.base_url,
                                 '/search/%s/' % (urllib.quote(query)))
     html = self._http_get(url, cache_limit=48)
     wanted = scraper_utils.normalize_title(title)

     found = []
     for _attrs, article in dom_parser2.parse_dom(html, 'article',
                                                  {'class': 'movie-details'}):
         links = dom_parser2.parse_dom(article, 'a', req='href')
         headings = dom_parser2.parse_dom(article, 'h2',
                                          {'class': 'movie-title'})
         years = dom_parser2.parse_dom(article, 'div',
                                       {'class': 'movie-year'})
         if not (links and headings):
             continue

         match_url = links[0].attrs['href']
         match_title = headings[0].content
         match_year = years[0].content if years else ''

         if wanted not in scraper_utils.normalize_title(match_title):
             continue
         if match_year and year and year != match_year:
             continue
         found.append({
             'url': scraper_utils.pathify_url(match_url),
             'title': scraper_utils.cleanse_title(match_title),
             'year': match_year
         })
     return found

Example #15

0

Show file

File: movie4k_scraper.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the movie list and return rows of the requested kind.

        Rows whose link text contains "(tvshow)" (any case) count as TV
        shows. The year is taken from the last <div> in the row whose
        content starts with four digits.

        :return: list of dicts with 'url', 'title' and 'year' keys
        """
        search_url = scraper_utils.urljoin(self.base_url, '/movies.php')
        html = self._http_get(search_url,
                              params={'list': 'search', 'search': title},
                              cookies={'onlylanguage': 'en', 'lang': 'en'},
                              cache_limit=8)
        found = []
        for _attrs, row in dom_parser2.parse_dom(html, 'TR', {'id': re.compile('coverPreview\d+')}):
            anchors = dom_parser2.parse_dom(row, 'a', req='href')
            if not anchors:
                continue

            anchor = anchors[0]
            match_url, match_title = anchor.attrs['href'], anchor.content
            is_show = re.search('\(tvshow\)', match_title, re.I)
            if video_type == VIDEO_TYPES.MOVIE and is_show:
                continue
            if video_type == VIDEO_TYPES.TVSHOW and not is_show:
                continue

            match_title = match_title.replace('(TVshow)', '').strip()

            match_year = ''
            for _div_attrs, div in dom_parser2.parse_dom(row, 'div'):
                year_match = re.match('\s*(\d{4})\s*', div)
                if year_match:
                    match_year = year_match.group(1)

            if year and match_year and year != match_year:
                continue
            found.append({
                'url': scraper_utils.pathify_url(match_url),
                'title': scraper_utils.cleanse_title(match_title),
                'year': match_year
            })
        return found

Example #16

0

Show file

File: heydl_scraper.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the site and return every heading link that matches `year`.

        The link text (with non-ASCII characters removed) is split into
        title and trailing four-digit year. When the text has no year, the
        whole text is used as the title and the year is taken from a
        trailing '-YYYY' in the link URL if there is one.

        :param video_type: unused
        :param title: title to look for
        :param year: release year to match; a falsy value accepts any year
        :param season: unused
        :return: list of dicts with 'title', 'year' and 'url' keys
        """
        results = []
        html = self._http_get(self.base_url,
                              params={'s': title},
                              cache_limit=8)
        for _attrs, item in dom_parser2.parse_dom(html, 'h2'):
            for attrs, match_title_year in dom_parser2.parse_dom(item,
                                                                 'a',
                                                                 req=['href']):
                match_url = attrs['href']
                match_title_year = re.sub('[^\x00-\x7F]', '', match_title_year)
                title_match = re.search('(.*?)\s+(\d{4})$', match_title_year)
                if title_match:
                    match_title, match_year = title_match.groups()
                else:
                    # Always assign the title here. Previously it was left
                    # unset (or stale from the prior link) when the URL
                    # carried the year, and the year was stored as the
                    # tuple returned by groups(1) instead of a string.
                    match_title = match_title_year
                    url_match = re.search('-(\d{4})/?$', match_url)
                    match_year = url_match.group(1) if url_match else ''

                if not year or not match_year or year == match_year:
                    result = {
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year,
                        'url': scraper_utils.pathify_url(match_url)
                    }
                    results.append(result)

        return results

Example #17

0

Show file

File: mintmovies_scraper.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the site and return movie links matching title and year.

        Returns early with no results when the page reports that nothing
        matched. Links whose title looks like an episode (SxxEyy) or
        mentions "TV SERIES" are ignored.

        :return: list of dicts with 'title', 'year' and 'url' keys
        """
        found = []
        html = self._http_get(self.base_url,
                              params={'s': title},
                              cache_limit=1)
        if re.search('Sorry, but nothing matched', html, re.I):
            return found

        wanted = scraper_utils.normalize_title(title)
        for _attrs, box in dom_parser2.parse_dom(html, 'li',
                                                 {'class': 'box-shadow'}):
            for link_attrs, _content in dom_parser2.parse_dom(
                    box, 'a', req=['href', 'title']):
                match_url = link_attrs['href']
                match_title_year = link_attrs['title']
                if re.search('S\d{2}E\d{2}', match_title_year):
                    continue
                if re.search('TV\s*SERIES', match_title_year, re.I):
                    continue

                match_title, match_year = scraper_utils.extra_year(
                    match_title_year)
                year_ok = not year or not match_year or year == match_year
                if year_ok and wanted in scraper_utils.normalize_title(
                        match_title):
                    found.append({
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year,
                        'url': scraper_utils.pathify_url(match_url)
                    })

        return found

Example #18

0

Show file

File: pronplus.py Project: Lhse44/repository.deallen

    def __get_links(self, url, video):
        """Collect hoster entries for `video` from every search type.

        For each entry in SEARCH_TYPES the URL is translated into an API
        search, and every result that has exactly one hoster URL, is not a
        rar archive, has not been seen before and passes the release check
        becomes a hoster dict.

        :param url: source URL passed to the search translation
        :param video: video object; `video_type` selects the link parser
        :return: list of hoster dicts
        """
        hosters = []
        seen_urls = set()
        for search_type in SEARCH_TYPES:
            search_url, params = self.__translate_search(url, search_type)
            if not search_url: continue
            html = self._http_get(search_url, params=params, cache_limit=.5)
            js_result = scraper_utils.parse_json(html, search_url)
            if js_result.get('status') != 'success':
                logger.log('Pron API Error: |%s|%s|: %s' % (search_url, params, js_result.get('message', 'Unknown Error')), log_utils.LOGWARNING)
                continue

            for result in js_result['result']:
                # Check the count before indexing: an empty hoster list
                # previously raised IndexError; multi-part results are
                # still skipped.
                if len(result['hosterurls']) != 1: continue
                if result['extension'] == 'rar': continue
                stream_url = result['hosterurls'][0]['url']
                if stream_url in seen_urls: continue
                if not scraper_utils.release_check(video, result['title']): continue

                host = urlparse.urlsplit(stream_url).hostname
                quality = scraper_utils.get_quality(video, host, self._get_title_quality(result['title']))
                hoster = {'multi-part': False, 'class': self, 'views': None, 'url': stream_url, 'rating': None, 'host': host, 'quality': quality, 'direct': False}
                hoster['extra'] = scraper_utils.cleanse_title(result['title'])
                if video.video_type == VIDEO_TYPES.MOVIE:
                    meta = scraper_utils.parse_movie_link(hoster['extra'])
                else:
                    meta = scraper_utils.parse_episode_link(hoster['extra'])
                if 'format' in meta: hoster['format'] = meta['format']

                hosters.append(hoster)
                seen_urls.add(stream_url)

        return hosters

Example #19

0

Show file

File: losmovies_scraper.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the site and return entries of the requested kind.

        An entry containing a 'movieTV' element counts as a TV show. No
        year is extracted from the page, so every result has year ''.

        :return: list of dicts with 'url', 'title' and 'year' keys
        """
        search_url = scraper_utils.urljoin(self.base_url, '/search')
        html = self._http_get(search_url,
                              params={'type': 'movies', 'q': title},
                              cache_limit=8)
        found = []
        for _attrs, entry in dom_parser2.parse_dom(
                html, 'div', {'id': re.compile('movie-\d+')}):
            is_tvshow = dom_parser2.parse_dom(entry, 'div',
                                              {'class': 'movieTV'})
            if video_type == VIDEO_TYPES.MOVIE and is_tvshow:
                continue
            if video_type == VIDEO_TYPES.TVSHOW and not is_tvshow:
                continue

            links = dom_parser2.parse_dom(entry, 'a', req='href')
            headings = dom_parser2.parse_dom(entry, 'h4')
            if not (links and headings):
                continue

            match_title = headings[0].content
            match_url = links[0].attrs['href']
            match_year = ''
            if year and match_year and year != match_year:
                continue
            found.append({
                'url': scraper_utils.pathify_url(match_url),
                'title': scraper_utils.cleanse_title(match_title),
                'year': match_year
            })
        return found

Example #20

0

Show file

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the site and parse the section for the requested kind.

        Non-movie searches return nothing unless `self.include_paid` is
        set. The text following the "Movies" or "TV Series" marker is
        scanned for links with an optional "(YYYY)" after them.

        :return: list of dicts with 'url', 'title' and 'year' keys
        """
        if not self.include_paid and video_type != VIDEO_TYPES.MOVIE:
            return []

        search_url = scraper_utils.urljoin(self.base_url, '/search.php')
        html = self._http_get(search_url, params={'q': title}, cache_limit=.25)

        section_pattern = ('<i>\s*Movies\s*</i>(.*)'
                           if video_type == VIDEO_TYPES.MOVIE else
                           '<i>\s*TV Series\s*</i>(.*)')
        section = re.search(section_pattern, html)
        if not section:
            return []

        found = []
        link_pattern = "href='([^']+)'>([^<]+)\s*</a>\s*(?:\((\d{4})\))?"
        for link in re.finditer(link_pattern, section.group(1)):
            # groups('') turns a missing year group into an empty string.
            url, match_title, match_year = link.groups('')
            if year and match_year and year != match_year:
                continue
            found.append({
                'url': scraper_utils.pathify_url(url),
                'title': scraper_utils.cleanse_title(match_title),
                'year': match_year
            })

        return found

Example #21

0

Show file

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Look the title up in the site's per-letter listing page.

        The listing is chosen by the first character of ``title`` (all digits
        share the '0-9' page).  Links inside the 'home' div are kept when the
        normalized search title is contained in the normalized link title and
        the year, if known on both sides, agrees.

        Returns a list of dicts with 'url', 'title' and 'year' keys; an empty
        list when ``title`` is empty or the listing container is missing.
        """
        if not title:
            return []

        letter = title[:1].lower()
        if letter.isdigit():
            letter = '0-9'
        listing_url = urlparse.urljoin(self.base_url,
                                       '/search.php/%s/' % (letter))
        html = self._http_get(listing_url, cache_limit=24)
        fragment = dom_parser.parse_dom(html, 'div', {'class': 'home'})
        if not fragment:
            return []

        found = []
        wanted = scraper_utils.normalize_title(title)
        link_re = '''href=["']([^'"]+)[^>]+>([^<]+)'''
        for link in re.finditer(link_re, fragment[0]):
            url, title_and_year = link.groups()
            match_title, match_year = scraper_utils.extra_year(title_and_year)
            if wanted not in scraper_utils.normalize_title(match_title):
                continue
            # Reject only when both years are known and they differ.
            if year and match_year and year != match_year:
                continue
            found.append({
                'url': scraper_utils.pathify_url(url),
                'title': scraper_utils.cleanse_title(match_title),
                'year': match_year
            })

        return found

Example #22

0

Show file

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Run a popularity-ordered site search and return the matching hits.

        Each 'searchResult' div supplies a link (itemprop=url), a name
        (itemprop=name) and optionally a year (itemprop=copyrightYear).  Hits
        whose URL does not contain ``FRAGMENTS[video_type]`` are dropped.

        Returns a list of dicts with 'url', 'title' and 'year' keys.
        """
        query_path = '/Category-FilmsAndTV/Genre-Any/Letter-Any/ByPopularity/1/Search-%s.htm' % (
            urllib.quote(title))
        html = self._http_get(scraper_utils.urljoin(self.base_url, query_path),
                              cache_limit=8)

        results = []
        hits = dom_parser2.parse_dom(html, 'div', {'class': 'searchResult'})
        for _attrs, hit in hits:
            links = dom_parser2.parse_dom(hit, 'a', {'itemprop': 'url'},
                                          req='href')
            names = dom_parser2.parse_dom(hit, 'span', {'itemprop': 'name'})
            years = dom_parser2.parse_dom(hit, 'span',
                                          {'itemprop': 'copyrightYear'})
            match_year = years[0].content if years else ''

            if not links or not names:
                continue
            # Reject only when both years are known and they differ.
            if year and match_year and year != match_year:
                continue

            match_url = links[0].attrs['href']
            match_title = names[0].content
            if FRAGMENTS[video_type] not in match_url.lower():
                continue
            results.append({
                'url': scraper_utils.pathify_url(match_url),
                'title': scraper_utils.cleanse_title(match_title),
                'year': match_year
            })
        return results

Example #23

0

Show file

File: iwatch_scraper.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """POST a search to /search and read the hits from the results table.

        The form field 'searchin' is 'm' for movies and 't' for anything
        else.  Every link with an href inside the first table of the
        'search-page' div becomes a hit; its text is split into title and
        year by ``scraper_utils.extra_year``.

        Returns a list of dicts with 'url', 'title' and 'year' keys, or an
        empty list when the expected container or table is missing.
        """
        found = []
        if video_type == VIDEO_TYPES.MOVIE:
            search_in = 'm'
        else:
            search_in = 't'
        form = {'searchquery': title, 'searchin': search_in}
        html = self._http_get(scraper_utils.urljoin(self.base_url, '/search'),
                              data=form,
                              cache_limit=8)

        page = dom_parser2.parse_dom(html, 'div', {'class': 'search-page'})
        if not page:
            return found
        table = dom_parser2.parse_dom(page[0].content, 'table')
        if not table:
            return found

        links = dom_parser2.parse_dom(table[0].content, 'a', req='href')
        for attrs, link_text in links:
            match_url = attrs['href']
            match_title, match_year = scraper_utils.extra_year(link_text)
            # Reject only when both years are known and they differ.
            if year and match_year and year != match_year:
                continue
            found.append({
                'url': scraper_utils.pathify_url(match_url),
                'title': scraper_utils.cleanse_title(match_title),
                'year': match_year
            })

        return found

Example #24

0

Show file

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Find shows in the site's /series.xml feed by title containment.

        Every <dizi> element whose <adi> text, once normalized, contains the
        normalized search title and that also has a <url> child is returned.
        XML parse failures are logged as a warning and whatever was collected
        up to that point is returned.

        Returns a list of dicts with 'url', 'title' and 'year' keys; 'year'
        is always ''.
        """
        results = []
        feed_url = scraper_utils.urljoin(self.base_url, '/series.xml')
        xml = self._http_get(feed_url, cache_limit=24)
        if not xml:
            return results

        try:
            wanted = scraper_utils.normalize_title(title)
            # No year is read from the feed, so match_year stays empty and
            # the year filter below never rejects an entry.
            match_year = ''
            for show in ET.fromstring(xml).findall('.//dizi'):
                name = show.find('adi')
                if name is None or wanted not in scraper_utils.normalize_title(
                        name.text):
                    continue
                url = show.find('url')
                if url is None:
                    continue
                if year and match_year and year != match_year:
                    continue
                results.append({
                    'url': scraper_utils.pathify_url(url.text),
                    'title': scraper_utils.cleanse_title(name.text),
                    'year': ''
                })
        except (ParseError, ExpatError) as e:
            logger.log('Dizilab Search Parse Error: %s' % (e),
                       log_utils.LOGWARNING)

        return results

Example #25

0

Show file

File: putlocker_scraper.py Project: Lhse44/repository.deallen

 def search(self, video_type, title, year, season=''):  # @UnusedVariable
     """Search the site root with ?search=<title> and parse the result cards.

     Each 'listCard' div contributes a hit when it has both an 'extraTitle'
     paragraph and a link; the 'cardYear' paragraph, when present, supplies
     the year.

     Returns a list of dicts with 'url', 'title' and 'year' keys.
     """
     found = []
     html = self._http_get(self.base_url,
                           params={'search': title},
                           headers={'Referer': self.base_url},
                           cache_limit=8)
     for card in dom_parser.parse_dom(html, 'div', {'class': 'listCard'}):
         titles = dom_parser.parse_dom(card, 'p', {'class': 'extraTitle'})
         hrefs = dom_parser.parse_dom(card, 'a', ret='href')
         years = dom_parser.parse_dom(card, 'p', {'class': 'cardYear'})
         if not hrefs or not titles:
             continue

         match_year = years[0] if years else ''
         # Reject only when both years are known and they differ.
         if year and match_year and year != match_year:
             continue
         found.append({
             'url': scraper_utils.pathify_url(hrefs[0]),
             'title': scraper_utils.cleanse_title(titles[0]),
             'year': match_year
         })
     return found

Example #26

0

Show file

File: ol_scraper.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the site with ?s=<title> and return the movie hits.

        Only 'result-item' divs that carry a 'movies' span and a 'title' div
        with a link are considered.  The year is taken from the link text
        when ``scraper_utils.extra_year`` finds one there, otherwise from the
        item's 'year' span.

        Items that are not movies or lack a usable link are skipped
        individually, so one unusable entry no longer discards every result
        that follows it on the page.

        Returns a list of dicts with 'title', 'year' and 'url' keys.
        """
        results = []
        html = self._http_get(self.base_url,
                              params={'s': title},
                              cache_limit=8)
        for _attrs, item in dom_parser2.parse_dom(html, 'div',
                                                  {'class': 'result-item'}):
            match = dom_parser2.parse_dom(item, 'div', {'class': 'title'})
            is_movie = dom_parser2.parse_dom(item, 'span', {'class': 'movies'})
            # Skip just this item; later items may still be valid movies.
            if not is_movie or not match: continue

            match = dom_parser2.parse_dom(match[0].content, 'a', req='href')
            if not match: continue

            match_url, match_title_year = match[0].attrs['href'], match[
                0].content
            match_title, match_year = scraper_utils.extra_year(
                match_title_year)
            if not match_year:
                # Fall back to the dedicated year span when the link text
                # carries no year.
                match_year = dom_parser2.parse_dom(item, 'span',
                                                   {'class': 'year'})
                match_year = match_year[0].content if match_year else ''

            if not year or not match_year or year == match_year:
                result = {
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year,
                    'url': scraper_utils.pathify_url(match_url)
                }
                results.append(result)

        return results

Example #27

0

Show file

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the site with ?s=<title> and return the non-episode hits.

        Each 'movie' div contributes a hit when it has a link and an image
        with an alt text.  Links that look like episode pages are skipped.
        The year is taken from the alt text when ``scraper_utils.extra_year``
        finds one there, otherwise from the item's 'year' div.

        Returns a list of dicts with 'url', 'title' and 'year' keys.
        """
        results = []
        html = self._http_get(self.base_url,
                              params={'s': title},
                              cache_limit=4)
        for _attrs, movie in dom_parser2.parse_dom(html, 'div',
                                                   {'class': 'movie'}):
            match_url = dom_parser2.parse_dom(movie, 'a', req='href')
            match_title_year = dom_parser2.parse_dom(movie, 'img', req='alt')
            if not (match_url and match_title_year):
                continue

            match_url = match_url[0].attrs['href']
            # Raw string so the regex escape is not treated as a string escape.
            if re.search(r'season-\d+-episode\d+', match_url): continue
            match_title_year = match_title_year[0].attrs['alt']

            match_title, match_year = scraper_utils.extra_year(
                match_title_year)
            if not match_year:
                # Explicit emptiness check instead of a bare except, which
                # would also have hidden unrelated errors.
                year_tags = dom_parser2.parse_dom(movie, 'div',
                                                  {'class': 'year'})
                match_year = year_tags[0].content if year_tags else ''

            if not year or not match_year or year == match_year:
                result = {
                    'url': scraper_utils.pathify_url(match_url),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year
                }
                results.append(result)

        return results

Example #28

0

Show file

File: watch8now_scraper.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Query /search?q=<title> and return the first link of each result cell.

        Every 'col-md-10' table cell is scanned for its first
        href="..."&gt;text pair.  No year information is extracted and no
        filtering by ``year`` or ``video_type`` is applied.

        Returns a list of dicts with 'url', 'title' and 'year' keys; 'year'
        is always ''.
        """
        html = self._http_get(urlparse.urljoin(self.base_url, '/search'),
                              params={'q': title},
                              cache_limit=8)
        hits = []
        for cell in dom_parser.parse_dom(html, 'td', {'class': 'col-md-10'}):
            link = re.search('href="([^"]+)">([^<]+)', cell)
            if not link:
                continue
            url, match_title = link.groups()
            hits.append({
                'url': scraper_utils.pathify_url(url),
                'title': scraper_utils.cleanse_title(match_title),
                'year': ''
            })

        return hits

Example #29

0

Show file

    def search(self, video_type, title, year, season=''):
        """Match the title against the category links on the site's front page.

        The front page is fetched without any query; each 'cat-item' link
        whose normalized text contains the normalized search title is
        returned.  ``video_type``, ``year`` and ``season`` are not consulted.

        Returns a list of dicts with 'url', 'title' and 'year' keys; 'year'
        is always ''.
        """
        html = self._http_get(self.base_url, cache_limit=8)
        wanted = scraper_utils.normalize_title(title)
        category_link = 'class="[^"]*cat-item.*?href="([^"]+)[^>]+>([^<]+)'
        links = (found.groups() for found in re.finditer(category_link, html))
        return [
            {
                'url': scraper_utils.pathify_url(url),
                'title': scraper_utils.cleanse_title(match_title),
                'year': ''
            }
            for url, match_title in links
            if wanted in scraper_utils.normalize_title(match_title)
        ]

Example #30

0

Show file

File: projectfree_scraper.py Project: Lhse44/repository.deallen

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Filter the links listed on the site's /search/ page by title.

        The page is requested without a query (only a Referer header); the
        matching happens locally: every <a> inside an <li> that has both
        'title' and 'href' attributes is kept when its normalized title
        attribute contains the normalized search title.

        Returns a list of dicts with 'url', 'title' and 'year' keys; 'year'
        is always ''.
        """
        wanted = scraper_utils.normalize_title(title)
        listing_url = scraper_utils.urljoin(self.base_url, '/search/')
        html = self._http_get(listing_url,
                              headers={'Referer': self.base_url},
                              cache_limit=8)

        hits = []
        for _attrs, list_item in dom_parser2.parse_dom(html, 'li'):
            anchors = dom_parser2.parse_dom(list_item, 'a',
                                            req=['title', 'href'])
            for attrs, _content in anchors:
                match_title = attrs['title']
                match_url = attrs['href']
                if wanted not in scraper_utils.normalize_title(match_title):
                    continue
                hits.append({
                    'url': scraper_utils.pathify_url(match_url),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': ''
                })

        return hits