Example #1
    def search(self, video_type, title, year, season=''):
        search_url = urlparse.urljoin(self.base_url,
                                      '/index.php?search_keywords=')
        search_url += urllib.quote_plus(title)
        search_url += '&year=' + urllib.quote_plus(str(year))
        if video_type == 'shows':
            search_url += '&search_section=2'
        else:
            search_url += '&search_section=1'
        results = []
        html = self._http_get(self.base_url, cache_limit=0)
        #kodi.log("HTML is : " + html)
        match = re.search('input type="hidden" name="key" value="([0-9a-f]*)"',
                          html)
        if match:
            key = match.group(1)
            search_url += '&key=' + key

            html = self._http_get(search_url, cache_limit=.25)
            pattern = r'class="index_item.+?href="(.+?)" title="Watch (.+?)"?\(?([0-9]{4})?\)?"?>'
            for match in re.finditer(pattern, html):
                match_url, match_title, match_year = match.groups('')
                result = {
                    'url': scraper_utils.pathify_url(match_url),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year
                }
                results.append(result)
        else:
            log_utils.log('Unable to locate PW search key',
                          log_utils.LOGWARNING)
        return results
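
The hidden-input regex above assumes the landing page embeds a per-session search key. A minimal, runnable sketch of just that extraction step, using an illustrative HTML fragment (the key value is made up):

import re

# Illustrative landing-page fragment; only the hidden input matters here.
html = '<form><input type="hidden" name="key" value="0a1b2c3d4e5f"></form>'
match = re.search('input type="hidden" name="key" value="([0-9a-f]*)"', html)
if match:
    print(match.group(1))  # -> 0a1b2c3d4e5f
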
Example #2
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        search_url = urlparse.urljoin(self.base_url, '/search/' + urllib.quote(title))
        headers = {'Referer': self.base_url}
        headers.update(XHR)
        params = {'ajax': 1, 's': title, 'type': 'TVShows'}
        html = self._http_get(search_url, params=params, cache_limit=8)
        for match in re.finditer('href="([^"]+)[^>]*>(.*?)</a>', html):
            match_url, match_title = match.groups()
            match_title = re.sub('</?span[^>]*>', '', match_title)
            match = re.search(r'\((\d{4})\)$', match_url)
            if match:
                match_year = match.group(1)
            else:
                match_year = ''

            if not year or not match_year or year == match_year:
                result = {
                    'url': scraper_utils.pathify_url(match_url),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year
                }
                results.append(result)

        return results
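
XHR is not defined in this snippet; in scrapers of this style it is conventionally a header dict that flags the request as AJAX. A likely definition, stated here as an assumption:

# Assumed definition of the XHR constant merged into the headers above.
XHR = {'X-Requested-With': 'XMLHttpRequest'}
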
Example #3
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        headers = {'Referer': self.base_url}
        params = {'search': title}
        html = self._http_get(self.base_url, params=params, headers=headers, cache_limit=8)
        for item in dom_parser.parse_dom(html, 'div', {'class': 'listCard'}):
            match_title = dom_parser.parse_dom(item, 'p', {'class': 'extraTitle'})
            match_url = dom_parser.parse_dom(item, 'a', ret='href')
            match_year = dom_parser.parse_dom(item, 'p', {'class': 'cardYear'})
            if match_url and match_title:
                match_url = match_url[0]
                match_title = match_title[0]
                match_year = match_year[0] if match_year else ''
                if not year or not match_year or year == match_year:
                    result = {
                        'url': scraper_utils.pathify_url(match_url),
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year
                    }
                    results.append(result)
        return results
Example #4
    def __movie_search(self, title, year):
        results = []
        norm_title = scraper_utils.normalize_title(title)
        html = self._http_get(self.base_url, cache_limit=48)
        for item in self._parse_directory(html):
            if item['directory']:
                continue
            meta = scraper_utils.parse_movie_link(item['title'])
            if meta['dubbed']:
                continue
            title_match = norm_title in scraper_utils.normalize_title(meta['title'])
            year_match = not year or not meta['year'] or year == meta['year']
            if title_match and year_match:
                match_title = meta['title'].replace('.', ' ')
                match_title += ' [%sp.%s]' % (meta['height'], meta['extra'])
                result = {
                    'url': scraper_utils.pathify_url(item['link']),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': meta['year']
                }
                results.append(result)
        return results
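
scraper_utils.parse_movie_link is not shown here; the key lookups above imply it returns at least the fields below. An illustrative return value, stated as an assumption (all values are made up):

# Hypothetical shape of parse_movie_link('Some.Movie.2015.1080p.BluRay.x264')
meta = {
    'title': 'Some.Movie',
    'year': '2015',
    'height': '1080',
    'extra': 'BluRay.x264',
    'dubbed': False,
}
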
Example #5
    def search(self, video_type, title, year, season=''):
        results = []
        search_url = urlparse.urljoin(
            self.base_url,
            '/?s=%s&submit=Search+Now!' % (urllib.quote_plus(title)))
        headers = {'Referer': search_url}
        html = self._http_get(search_url, headers=headers, cache_limit=8)
        index = 0 if video_type == 'shows' else 1
        fragments = re.findall('<h2.*?(?=<h2|$)', html, re.DOTALL)
        if len(fragments) > index:
            for item in dom_parser.parse_dom(fragments[index], 'div',
                                             {'class': 'aaa_item'}):
                match_title_year = dom_parser.parse_dom(item, 'a', ret='title')
                match_url = dom_parser.parse_dom(item, 'a', ret='href')
                if match_title_year and match_url:
                    match_url = match_url[0]
                    match_title_year = match_title_year[0]
                    match = re.search(r'(.*?)\s+\((\d{4})\)', match_title_year)
                    if match:
                        match_title, match_year = match.groups()
                    else:
                        match_title = match_title_year
                        match_year = ''

                    if not year or not match_year or year == match_year:
                        result = {
                            'url': scraper_utils.pathify_url(match_url),
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year
                        }
                        results.append(result)
        return results
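
The lookahead pattern above carves the page into per-<h2> fragments without consuming the next heading. A self-contained demonstration on a toy page (the markup is illustrative), in the section order the scraper expects, shows first and movies second:

import re

html = '<h2>TV Shows</h2><div class="aaa_item">a</div><h2>Movies</h2><div class="aaa_item">b</div>'
fragments = re.findall('<h2.*?(?=<h2|$)', html, re.DOTALL)
print(fragments[0])  # -> <h2>TV Shows</h2><div class="aaa_item">a</div>
print(fragments[1])  # -> <h2>Movies</h2><div class="aaa_item">b</div>
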
Example #6
    def search(self, video_type, title, year, season=''):
        results = []
        if video_type == 'movies':
            results = self.__movie_search(title, year)
        else:
            norm_title = scraper_utils.normalize_title(title)
            html = self._http_get(self.base_url, cache_limit=48)
            for item in self._parse_directory(html):
                if norm_title in scraper_utils.normalize_title(item['title']) and item['directory']:
                    result = {
                        'url': scraper_utils.pathify_url(item['link']),
                        'title': scraper_utils.cleanse_title(item['title']),
                        'year': ''
                    }
                    results.append(result)
        return results
Example #7
    def search(self, video_type, title, year, season=''):
        results = []
        search_url = urlparse.urljoin(self.base_url, '/wp-content/themes/afdah/ajax-search.php')
        data = {'search': title, 'type': 'title'}
        html = self._http_get(search_url, data=data, cache_limit=1)
        pattern = r'<li>.*?href="([^"]+)">([^<]+)\s+\((\d{4})\)'
        for match in re.finditer(pattern, html, re.DOTALL | re.I):
            match_url, match_title, match_year = match.groups('')
            if not year or not match_year or year == match_year:
                result = {
                    'url': scraper_utils.pathify_url(match_url),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year
                }
                results.append(result)
        return results
Example #8
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        folders = ['/addons/real-movies/base.xml']
        norm_title = scraper_utils.normalize_title(title)
        for page_url in folders:
            xml_file = os.path.basename(page_url)
            page_url = urlparse.urljoin(self.base_url, page_url)
            xml = self._http_get(page_url, require_debrid=True, cache_limit=48)
            new_folders = re.findall('<folder>(.*?)</folder>', xml, re.I)
            if new_folders:
                # deliberately grows the list being iterated, so nested folders are crawled too
                folders += [folder for folder in new_folders if folder]

            for match in re.finditer('<item>(.*?)</item>', xml,
                                     re.I | re.DOTALL):
                item = match.group(1)
                match_title_year = re.search('<title>(.*?)</title>', item,
                                             re.I)
                match_url = re.search('<link>(.*?)</link>', item, re.I)
                if match_title_year and match_url:
                    match_title_year = match_title_year.group(1)
                    match_url = match_url.group(1)
                    if match_title_year and match_url:
                        match_title, match_year = scraper_utils.extra_year(match_title_year)
                        xml_file = xml_file.replace(' ', '').lower()
                        match_url = 'xml_file=%s&link=%s' % (xml_file, match_url)
                        title_match = norm_title in scraper_utils.normalize_title(match_title)
                        year_match = not year or not match_year or year == match_year
                        if title_match and year_match:
                            if 'format' in XML_META.get(xml_file, {}):
                                match_title += ' (%s)' % (XML_META[xml_file]['format'])
                            result = {
                                'title': scraper_utils.cleanse_title(match_title),
                                'year': match_year,
                                'url': match_url
                            }
                            results.append(result)
        return results
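
The regexes above imply a flat XML feed of <folder> links plus <item> entries with <title> and <link> children. A runnable sketch against an illustrative fragment:

import re

xml = '''
<folder>/addons/real-movies/extra.xml</folder>
<item>
    <title>Some Movie (2015)</title>
    <link>http://example.com/some-movie</link>
</item>
'''
print(re.findall('<folder>(.*?)</folder>', xml, re.I))  # -> ['/addons/real-movies/extra.xml']
for match in re.finditer('<item>(.*?)</item>', xml, re.I | re.DOTALL):
    item = match.group(1)
    print(re.search('<title>(.*?)</title>', item, re.I).group(1))  # -> Some Movie (2015)
    print(re.search('<link>(.*?)</link>', item, re.I).group(1))    # -> http://example.com/some-movie
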
Example #9
    def search(self, video_type, title, year, season=''):
        results = []
        if video_type == 'movies':
            url = urlparse.urljoin(self.base_url, '/movies/a-z/')
        else:
            url = urlparse.urljoin(self.base_url, '/tv/a-z/')

        # Only the first letter matters for the A-Z index; skip over a leading article.
        if title.upper().startswith('THE '):
            search_title = title[4:]
        elif title.upper().startswith('A '):
            search_title = title[2:]
        else:
            search_title = title

        if title[:1] in string.digits:
            first_letter = '1'
        else:
            first_letter = search_title[:1]
        url = url + first_letter.upper()

        html = self._http_get(url, cache_limit=48)
        norm_title = scraper_utils.normalize_title(title)
        pattern = 'class=star.*?href=([^>]+)>(.*?)</a>'
        for match in re.finditer(pattern, html, re.DOTALL):
            match_url, match_title_year = match.groups()
            match = re.search(r'(.*?)\s+\((\d{4})\)', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                match_year = ''

            if norm_title in scraper_utils.normalize_title(match_title) and (
                    not year or not match_year or year == match_year):
                result = {
                    'url': match_url,
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year
                }
                results.append(result)
        return results
Example #10
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        search_url = urlparse.urljoin(self.base_url, '/search/ajax_search')
        html = self._http_get(search_url,
                              params={'q': title},
                              headers=XHR,
                              cache_limit=1)
        js_result = scraper_utils.parse_json(html, search_url)
        match_year = ''  # this endpoint returns no year, so the year filter below always passes
        if 'series' in js_result:
            for series in js_result['series']:
                if 'seo' in series and 'label' in series:
                    if not year or not match_year or year == match_year:
                        result = {
                            'url': scraper_utils.pathify_url('/' + series['seo']),
                            'title': scraper_utils.cleanse_title(series['label']),
                            'year': match_year
                        }
                        results.append(result)

        return results
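
scraper_utils.parse_json is expected to yield a dict shaped like the payload below; only the seo and label keys are read, and no year is available, which is why match_year stays empty. An illustrative response, stated as an assumption:

# Hypothetical AJAX payload implied by the key lookups above (values made up).
js_result = {
    'series': [
        {'seo': 'some-show', 'label': 'Some Show'},
        {'seo': 'another-show', 'label': 'Another Show'},
    ]
}
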
Example #11
    def search(self, video_type, title, year, season=''):
        results = []
        search_url = urlparse.urljoin(self.base_url, '/search')
        if video_type == 'movies':
            data = {'searchin': 'm'}
        else:
            data = {'searchin': 't'}
        data.update({'searchquery': title})
        html = self._http_get(search_url, data=data, cache_limit=8)
        pattern = r'href="([^"]+)">(.*?)\s+\((\d{4})\)'
        for match in re.finditer(pattern, html):
            match_url, match_title, match_year = match.groups('')
            if not year or not match_year or year == match_year:
                # fix the wrong URL returned from search results
                match_url = match_url.replace('/episode/', '/tv-shows/')
                result = {
                    'url': scraper_utils.pathify_url(match_url),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year
                }
                results.append(result)
        return results
Example #12
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        search_url = urlparse.urljoin(self.base_url, '/wp-content/themes/afdah/ajax-search.php')
        data = {'search': title, 'type': 'title'}
        html = self._http_get(search_url, data=data, headers=XHR, cache_limit=1)
        for item in dom_parser.parse_dom(html, 'li'):
            match_url = dom_parser.parse_dom(item, 'a', ret='href')
            match_title_year = dom_parser.parse_dom(item, 'a')
            if match_url and match_title_year:
                match_url = match_url[0]
                match_title, match_year = scraper_utils.extra_year(match_title_year[0])
                if not year or not match_year or year == match_year:
                    result = {
                        'url': scraper_utils.pathify_url(match_url),
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year
                    }
                    results.append(result)
        return results
Example #13
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        search_url = urlparse.urljoin(self.base_url, '/search/')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, require_debrid=True, cache_limit=1)
        if video_type == 'shows':
            seen_urls = {}
            for post in dom_parser.parse_dom(html, 'div', {'id': r'post-\d+'}):
                if 'shows' in post:
                    match = re.search(r'<span>\s*TAGS:\s*</span>\s*<a\s+href="([^"]+)[^>]+>([^<]+)', post, re.I)
                    if match:
                        show_url, match_title = match.groups()
                        if show_url not in seen_urls:
                            result = {
                                'url': scraper_utils.pathify_url(show_url),
                                'title': scraper_utils.cleanse_title(match_title),
                                'year': ''
                            }
                            seen_urls[show_url] = result
                            results.append(result)
        elif video_type == 'movies':
            headings = re.findall(r'<h2>\s*<a\s+href="([^"]+)[^>]+>(.*?)</a>', html)
            posts = dom_parser.parse_dom(html, 'div', {'id': r'post-\d+'})
            norm_title = scraper_utils.normalize_title(title)
            for heading, post in zip(headings, posts):
                if 'movies' in post and not self.__too_old(post):
                    post_url, post_title = heading
                    match = re.search(r'(.*?)\s*[.\[(]?(\d{4})[.)\]]?\s*(.*)', post_title)
                    if match:
                        match_title, match_year, extra_title = match.groups()
                        full_title = '%s [%s]' % (match_title, extra_title)
                    else:
                        full_title = match_title = post_title
                        match_year = ''

                    match_norm_title = scraper_utils.normalize_title(match_title)
                    if (match_norm_title in norm_title or norm_title in match_norm_title) and (not year or not match_year or year == match_year):
                        result = {
                            'url': scraper_utils.pathify_url(post_url),
                            'title': scraper_utils.cleanse_title(full_title),
                            'year': match_year
                        }
                        results.append(result)

        return results
Example #14
    def _blog_proc_results(self, html, post_pattern, date_format, video_type,
                           title, year):
        results = []
        search_date = ''
        search_sxe = ''
        if video_type == VIDEO_TYPES.EPISODE:
            match = re.search(r'(.*?)\s*(S\d+E\d+)\s*', title)
            if match:
                show_title, search_sxe = match.groups()
            else:
                match = re.search(r'(.*?)\s*(\d{4})[._ -]?(\d{2})[._ -]?(\d{2})\s*', title)
                if match:
                    show_title, search_year, search_month, search_day = match.groups()
                    search_date = '%s-%s-%s' % (search_year, search_month, search_day)
                    search_date = scraper_utils.to_datetime(search_date, "%Y-%m-%d").date()
                else:
                    show_title = title
        else:
            show_title = title

        today = datetime.date.today()
        for match in re.finditer(post_pattern, html, re.DOTALL):
            post_data = match.groupdict()
            post_title = post_data['post_title']
            post_title = re.sub('<[^>]*>', '', post_title)
            if 'quality' in post_data:
                post_title += ' - [%s]' % (post_data['quality'])

            try:
                filter_days = int(kodi.get_setting('%s-filter' % (self.get_name())))
            except ValueError:
                filter_days = 0
            if filter_days and date_format and 'date' in post_data:
                post_data['date'] = post_data['date'].strip()
                filter_days = datetime.timedelta(days=filter_days)
                post_date = scraper_utils.to_datetime(post_data['date'],
                                                      date_format).date()
                if not post_date:
                    log_utils.log('Failed date check in %s: |%s|%s|' %
                                  (self.get_name(), post_data['date'], date_format),
                                  log_utils.LOGWARNING)
                    post_date = today

                if today - post_date > filter_days:
                    continue

            match_year = ''
            match_date = ''
            match_sxe = ''
            match_title = full_title = post_title
            if video_type == VIDEO_TYPES.MOVIE:
                meta = scraper_utils.parse_movie_link(post_title)
                match_year = meta['year']
            else:
                meta = scraper_utils.parse_episode_link(post_title)
                match_sxe = 'S%02dE%02d' % (int(meta['season']), int(meta['episode']))
                match_date = meta['airdate']

            match_title = meta['title']
            full_title = '%s (%sp) [%s]' % (meta['title'], meta['height'],
                                            meta['extra'])
            norm_title = scraper_utils.normalize_title(show_title)
            match_norm_title = scraper_utils.normalize_title(match_title)
            title_match = norm_title and (match_norm_title in norm_title
                                          or norm_title in match_norm_title)
            year_match = not year or not match_year or year == match_year
            sxe_match = not search_sxe or (search_sxe == match_sxe)
            date_match = not search_date or (search_date == match_date)
            log_utils.log(
                'Blog Results: |%s|%s|%s| - |%s|%s|%s| - |%s|%s|%s| - |%s|%s|%s| (%s)'
                % (match_norm_title, norm_title, title_match, year, match_year,
                   year_match, search_date, match_date, date_match, search_sxe,
                   match_sxe, sxe_match, self.get_name()), log_utils.LOGDEBUG)
            if title_match and year_match and date_match and sxe_match:
                result = {
                    'url': scraper_utils.pathify_url(post_data['url']),
                    'title': scraper_utils.cleanse_title(full_title),
                    'year': match_year
                }
                results.append(result)
        return results
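
Because the processor reads match.groupdict(), any post_pattern passed in must use named groups: url and post_title are required, while date and quality are optional. A hypothetical pattern in that convention (the surrounding markup is illustrative, not taken from a real site):

# Hypothetical post_pattern with the named groups _blog_proc_results reads.
post_pattern = (
    r'<h2[^>]*>\s*<a\s+href="(?P<url>[^"]+)"[^>]*>(?P<post_title>[^<]+)</a>'
    r'.*?<span\s+class="date">(?P<date>[^<]+)</span>'
)
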
Example #15
    def processCaptcha(self, key, lang, name=None, referer=None):
        if referer is None:
            referer = 'https://www.google.com/recaptcha/api2/demo'
        headers = {'Referer': referer, 'Accept-Language': lang}
        html = get_url('http://www.google.com/recaptcha/api/fallback?k=%s' % (key), headers=headers)
        token = ''
        iteration = 0
        while True:
            payload = dom_parser2.parse_dom(
                html, 'img', {'class': 'fbc-imageselect-payload'}, req='src')
            iteration += 1
            message = dom_parser2.parse_dom(
                html, 'label', {'class': 'fbc-imageselect-message-text'})
            if not message:
                message = dom_parser2.parse_dom(
                    html, 'div', {'class': 'fbc-imageselect-message-error'})

            if message and payload:
                message = message[0].content
                payload = payload[0].attrs['src']
            else:
                token = dom_parser2.parse_dom(
                    html, 'div', {'class': 'fbc-verification-token'})
                if token:
                    token = dom_parser2.parse_dom(token[0].content,
                                                  'textarea')[0].content
                    logger.log('Captcha Success: %s' % (token),
                               log_utils.LOGDEBUG)
                else:
                    logger.log('Captcha Failed', log_utils.LOGDEBUG)
                break

            cval = dom_parser2.parse_dom(html,
                                         'input', {'name': 'c'},
                                         req='value')
            if not cval: break

            cval = cval[0].attrs['value']
            captcha_imgurl = scraper_utils.urljoin(
                'https://www.google.com', scraper_utils.cleanse_title(payload))
            message = message.replace('<strong>',
                                      '[B]').replace('</strong>', '[/B]')
            message = re.sub(re.compile('</?(div|strong)[^>]*>', re.I), '',
                             message)
            if '<' in message or '>' in message:
                logger.log('Suspicious Captcha Prompt: %s' % (message),
                           log_utils.LOGWARNING)

            oSolver = cInputWindow(captcha=captcha_imgurl,
                                   msg=message,
                                   iteration=iteration,
                                   name=name)
            captcha_response = oSolver.get()
            if not captcha_response:
                break

            data = {'c': cval, 'response': captcha_response}
            html = get_url(
                "http://www.google.com/recaptcha/api/fallback?k=%s" % (key),
                data=data,
                headers=headers)
        return token
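
The prompt cleanup in the loop above converts <strong> emphasis to Kodi's [B]...[/B] markup and strips any leftover div/strong tags; a self-contained run on a made-up fallback-captcha message:

import re

message = '<div><strong>Select all images</strong> with street signs</div>'
message = message.replace('<strong>', '[B]').replace('</strong>', '[/B]')
message = re.sub(re.compile('</?(div|strong)[^>]*>', re.I), '', message)
print(message)  # -> [B]Select all images[/B] with street signs
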