def _get_episode_url(self, show_url, video):
    """Walk the show's post pages looking for the episode's post.

    Pages are followed via the 'nextpostslink' anchor until a match is
    found, pages run out, or a post is older than the allowed age.
    Returns a pathified episode URL, or None if nothing matched.
    """
    force_title = scraper_utils.force_title(video)
    title_fallback = kodi.get_setting('title-fallback') == 'true'
    norm_title = scraper_utils.normalize_title(video.ep_title)
    # page_url is a one-element list so the while condition doubles as
    # a "no next page" check (parse_dom returns a list of hrefs).
    page_url = [show_url]
    too_old = False
    while page_url and not too_old:
        url = urlparse.urljoin(self.base_url, page_url[0])
        html = self._http_get(url, require_debrid=True, cache_limit=1)
        posts = dom_parser.parse_dom(html, 'div', {'id': 'post-\d+'})
        for post in posts:
            if self.__too_old(post):
                # Posts are newest-first; once one is too old, stop paging.
                too_old = True
                break
            if CATEGORIES[VIDEO_TYPES.TVSHOW] in post and show_url in post:
                match = re.search('<a\s+href="([^"]+)[^>]+>(.*?)</a>', post)
                if match:
                    url, title = match.groups()
                    if not force_title:
                        # Normal path: match on release-string checks.
                        if scraper_utils.release_check(video, title, require_title=False):
                            return scraper_utils.pathify_url(url)
                    else:
                        # Forced-title path: compare normalized episode titles
                        # pulled from the post body, if the setting allows it.
                        if title_fallback and norm_title:
                            match = re.search('</strong>(.*?)</p>', post)
                            if match and norm_title == scraper_utils.normalize_title(match.group(1)):
                                return scraper_utils.pathify_url(url)
        page_url = dom_parser.parse_dom(html, 'a', {'class': 'nextpostslink'}, ret='href')
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Query the site's search endpoint and scrape result cards.

    Returns a list of dicts with 'url', 'title', and 'year' keys; a
    result is kept when it has no year, the query has no year, or the
    years agree.
    """
    results = []
    html = self._http_get(self.base_url, params={'search': title},
                          headers={'Referer': self.base_url}, cache_limit=8)
    for card in dom_parser.parse_dom(html, 'div', {'class': 'listCard'}):
        titles = dom_parser.parse_dom(card, 'p', {'class': 'extraTitle'})
        urls = dom_parser.parse_dom(card, 'a', ret='href')
        years = dom_parser.parse_dom(card, 'p', {'class': 'cardYear'})
        if not (urls and titles):
            continue
        card_year = years[0] if years else ''
        # Skip only when both years are present and disagree.
        if year and card_year and year != card_year:
            continue
        results.append({
            'url': scraper_utils.pathify_url(urls[0]),
            'title': scraper_utils.cleanse_title(titles[0]),
            'year': card_year,
        })
    return results
def get_sources(self, video, video_type):
    """Scrape the link table of the video's page into hoster dicts.

    Sponsored rows are skipped.  A row needs at least two <span>s
    (host, rating) and an href to be kept.  Returns a list of hoster
    dicts; 'rating' is filled in when the scraped value looks like a
    percentage (e.g. '85%').
    """
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        page_url = urlparse.urljoin(self.base_url, source_url)
        # BUG FIX: header key was misspelled 'Refer', so no Referer was sent.
        headers = {'Referer': self.base_url}
        html = self._http_get(page_url, headers=headers, cache_limit=.5)
        table = dom_parser.parse_dom(html, 'div', {'class': 'linktable'})
        if table:
            for row in dom_parser.parse_dom(table[0], 'tr'):
                spans = dom_parser.parse_dom(row, 'span')
                stream_url = dom_parser.parse_dom(row, 'a', ret='href')
                is_sponsored = any('sponsored' in i.lower() for i in spans)
                if not is_sponsored and len(spans) > 1 and stream_url:
                    host, rating = spans[0], spans[1]
                    stream_url = stream_url[0]
                    quality = scraper_utils.get_quality(
                        video, host, QUALITIES.HIGH)
                    hoster = {
                        'multi-part': False,
                        'host': host,
                        'class': self,
                        'quality': quality,
                        'views': None,
                        'rating': None,
                        'url': stream_url,
                        'direct': False
                    }
                    # BUG FIX: original tested the literal string
                    # 'rating'.endswith('%'), which is always False, so the
                    # scraped rating was never stored.
                    if rating.endswith('%') and rating[:-1].isdigit():
                        hoster['rating'] = rating[:-1]
                    hosters.append(hoster)
    return hosters
def search(self, video_type, title, year, season=''):
    """Search the site and scrape 'aaa_item' result cards.

    The results page has one <h2>-headed fragment per section; shows
    live in the first fragment, movies in the second.  Returns a list
    of dicts with 'url', 'title', and 'year' keys.
    """
    results = []
    search_url = urlparse.urljoin(
        self.base_url,
        '/?s=%s&submit=Search+Now!' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, headers={'Referer': search_url}, cache_limit=8)
    fragments = re.findall('<h2.*?(?=<h2|$)', html, re.DOTALL)
    section = 0 if video_type == 'shows' else 1
    if len(fragments) <= section:
        return results
    for card in dom_parser.parse_dom(fragments[section], 'div', {'class': 'aaa_item'}):
        titles = dom_parser.parse_dom(card, 'a', ret='title')
        urls = dom_parser.parse_dom(card, 'a', ret='href')
        if not (titles and urls):
            continue
        title_year = titles[0]
        # Titles may carry a trailing '(YYYY)' year.
        found = re.search('(.*?)\s+\((\d{4})\)', title_year)
        if found:
            card_title, card_year = found.groups()
        else:
            card_title, card_year = title_year, ''
        if not year or not card_year or year == card_year:
            results.append({
                'url': scraper_utils.pathify_url(urls[0]),
                'title': scraper_utils.cleanse_title(card_title),
                'year': card_year,
            })
    return results
def get_sources(self, video, video_type):
    """Scrape the 'streamlinks' table into hoster dicts.

    Each row yields url/host/age/quality; after the scan, each
    hoster's 'rating' is set to its age percentile within the scraped
    min/max age range.  Returns the list of hoster dicts.
    """
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(url, cache_limit=.5)
        fragment = dom_parser.parse_dom(html, 'table', {'id': 'streamlinks'})
        if fragment:
            # Track the age range so ratings can be normalized afterwards.
            max_age = 0
            now = min_age = int(time.time())
            for row in dom_parser.parse_dom(fragment[0], 'tr', {'id': 'pt\d+'}):
                # Movie and episode rows have slightly different cell layouts.
                if video_type == 'movies':
                    pattern = 'href="([^"]+).*?/>([^<]+).*?(?:<td>.*?</td>\s*){1}<td>(.*?)</td>\s*<td>(.*?)</td>'
                else:
                    pattern = 'href="([^"]+).*?/>([^<]+).*?(<span class="linkdate">.*?)</td>\s*<td>(.*?)</td>'
                match = re.search(pattern, row, re.DOTALL)
                if match:
                    url, host, age, quality = match.groups()
                    age = self.__get_age(now, age)
                    quality = quality.upper()
                    if age > max_age:
                        max_age = age
                    if age < min_age:
                        min_age = age
                    host = host.strip()
                    hoster = {
                        'hostname': 'iWatchOnline',
                        'multi-part': False,
                        'class': '',
                        'url': self.resolve_link(url),
                        'host': host,
                        'age': age,
                        'views': None,
                        'rating': None,
                        'direct': False
                    }
                    hoster['quality'] = scraper_utils.get_quality(
                        video, host, QUALITY_MAP.get(quality, QUALITIES.HIGH))
                    hosters.append(hoster)
            # Convert ages to a 0-100 rating relative to the observed range.
            # NOTE(review): integer division here (Python 2) — ratings are
            # truncated to whole percentiles.
            unit = (max_age - min_age) / 100
            if unit > 0:
                for hoster in hosters:
                    hoster['rating'] = (hoster['age'] - min_age) / unit
        main_scrape.apply_urlresolver(hosters)
    return hosters
def what_sports():
    """List today's televised sports fixtures (UK + US listings).

    Scrapes wheresthematch.com for UK fixtures and tvguide.com for US
    listings, adding one Kodi list item per event.  Pure side-effect
    function: no return value.
    """
    link = OPEN_URL('http://www.wheresthematch.com/tv/home.asp').replace('\r', '').replace('\n', '').replace('\t', '')
    match = re.compile('href="http://www.wheresthematch.com/fixtures/(.+?).asp.+?class="">(.+?)</em> <em class="">v</em> <em class="">(.+?)</em>.+?time-channel ">(.+?)</span>').findall(link)
    for game, name1, name2, gametime in match:
        # Build the label once; the original duplicated the identical
        # string for the item title and its description.
        label = '[COLOR gold][B]' + game + ' ' + '[/COLOR][/B]- [COLOR white]' + name1 + ' vs ' + name2 + ' - ' + gametime + ' [/COLOR]'
        kodi.addItem(label, '', '', artwork + 'icon.png', description=label)
    xbmc.executebuiltin("Container.SetViewMode(55)")
    # #######AMERICAN###############
    link = OPEN_URL('http://www.tvguide.com/sports/live-today/').replace('\r', '').replace('\n', '').replace('\t', '')
    sections = dom_parser.parse_dom(link, 'div', {'class': "listings-program-content"})
    # FIX: removed the unused 'listings' parse, and renamed the loop
    # variable 'time' -> 'airtime' so it no longer shadows the time module.
    for stuff in sections:
        match = re.compile('class="listings-program-link">(.+?)</span></h3>.+?class="listings-program-link">.+?listings-program-airing-info">(.+?)</p><p.+?description">(.+?)</p>').findall(stuff)
        for name, airtime, description in match:
            label = '[COLOR gold][B]' + name_cleaner(name) + ' ' + '[/COLOR][/B]- [COLOR white]' + ' - ' + airtime + ' [/COLOR]'
            kodi.addItem(label, '', '', artwork + 'icon.png', description=label)
    viewsetter.set_view("files")
def get_sources(self, video, video_type):
    """Scrape 'ldr-item' blocks on the episode page into hoster dicts.

    Each item contributes its data-actuallink URL, an optional view
    count, and an optional 0-100 rating derived from the site's
    'point' score.
    """
    source_url = self.get_url(video)
    hosters = []
    if not source_url or source_url == FORCE_NO_MATCH:
        return hosters
    page_url = urlparse.urljoin(self.base_url, source_url)
    html = self._http_get(page_url, cache_limit=.25)
    for item in dom_parser.parse_dom(html, 'div', {'class': '[^"]*ldr-item[^"]*'}):
        actual = dom_parser.parse_dom(item, 'a', ret='data-actuallink')
        views = None
        counts = dom_parser.parse_dom(item, 'div', {'class': 'click-count'})
        if counts:
            count_match = re.search(' (\d+) ', counts[0])
            if count_match:
                views = count_match.group(1)
        points = dom_parser.parse_dom(item, 'div', {'class': '\s*point\s*'})
        if points:
            points = int(points[0])
        # A zero/missing score means no rating.
        rating = points * 10 if points else None
        if not actual:
            continue
        stream_url = actual[0].strip()
        host = urlparse.urlparse(stream_url).hostname
        hosters.append({
            'hostname': 'WatchEpisodes',
            'multi-part': False,
            'host': host,
            'class': self,
            'quality': scraper_utils.get_quality(video, host, QUALITIES.HIGH),
            'views': views,
            'rating': rating,
            'url': stream_url,
            'direct': False
        })
    main_scrape.apply_urlresolver(hosters)
    return hosters
def resolve_link(self, link):
    """Resolve a scraped link to a playable URL.

    Relative links are fetched from the site and the first iframe's
    src is returned (falling back to the original link).  Absolute
    links are returned unchanged.
    """
    # BUG FIX: the original had no return on the absolute-link path and
    # implicitly returned None for any link already starting with 'http'.
    if link.startswith('http'):
        return link
    stream_url = urlparse.urljoin(self.base_url, link)
    html = self._http_get(stream_url, cache_limit=0)
    iframe_url = dom_parser.parse_dom(html, 'iframe', ret='src')
    if iframe_url:
        return iframe_url[0]
    return link
def resolve_link(self, link):
    """Resolve a scraped link to a playable URL.

    Relative links are fetched and the 'myButton p2' anchor's href is
    returned (falling back to the original link).  Absolute links are
    returned unchanged.
    """
    # BUG FIX: the original only returned inside the relative-link branch,
    # so an absolute ('http...') link fell through and returned None.
    if link.startswith('http'):
        return link
    url = urlparse.urljoin(self.base_url, link)
    html = self._http_get(url, cache_limit=0)
    stream_url = dom_parser.parse_dom(html, 'a', {'class': 'myButton p2'}, ret='href')
    if stream_url:
        return stream_url[0]
    return link
def _get_episode_url(self, show_url, video):
    """Find the episode URL on the show's page by SxxExx pattern.

    Fetches the show page and searches its HTML for an href containing
    the zero-padded season/episode marker.  Returns a pathified URL, or
    None when the page is empty or no link matches.
    """
    # FIX: removed a duplicated comment line and the unused 'episodes'
    # local (its parse_dom result was never read).
    url = urlparse.urljoin(self.base_url, show_url)
    html = self._http_get(url, cache_limit=2)
    if html:
        # (?!\d) guards against e.g. S01E01 matching inside S01E011.
        episode_pattern = 'href="([^"]*-[sS]%02d[eE]%02d(?!\d)[^"]*)' % (
            int(video.season), int(video.episode))
        match = re.search(episode_pattern, html)
        if match:
            return scraper_utils.pathify_url(match.group(1))
def get_sources(self, video, video_type):
    """Scrape IceFilms sources for *video*.

    Loads the video page, follows the 'videoframe' iframe, extracts the
    site's anti-scrape tokens (secret, t, s, m) from the frame's script,
    then builds one AJAX resolve URL per listed host link.  Any failure
    is logged and an empty/partial list is returned.
    """
    source_url = self.get_url(video)
    sources = []
    if source_url and source_url != FORCE_NO_MATCH:
        try:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=2)
            pattern = '<iframe id="videoframe" src="([^"]+)'
            match = re.search(pattern, html)
            url = urlparse.urljoin(self.base_url, match.group(1))
            html = self._http_get(url, cache_limit=.5)
            # The 'secret' value may be split across two JS string literals;
            # groups('') joins them with missing groups as ''.
            match = re.search('lastChild\.value="([^"]+)"(?:\s*\+\s*"([^"]+))?', html)
            secret = ''.join(match.groups(''))
            match = re.search('"&t=([^"]+)', html)
            t = match.group(1)
            # s and m are numeric counters the site expects to have advanced;
            # random offsets are added below when building each AJAX URL.
            match = re.search('(?:\s+|,)s\s*=(\d+)', html)
            s_start = int(match.group(1))
            match = re.search('(?:\s+|,)m\s*=(\d+)', html)
            m_start = int(match.group(1))
            for fragment in dom_parser.parse_dom(html, 'div', {'class': 'ripdiv'}):
                # Each 'ripdiv' section is headed by a <b> quality label.
                match = re.match('<b>(.*?)</b>', fragment)
                if match:
                    q_str = match.group(1).replace(' ', '').upper()
                    quality = QUALITY_MAP.get(q_str, QUALITIES.HIGH)
                else:
                    quality = QUALITIES.HIGH
                pattern = '''onclick='go\((\d+)\)'>([^<]+)(<span.*?)</a>'''
                for match in re.finditer(pattern, fragment):
                    link_id, label, host_fragment = match.groups()
                    source = {'hostname': 'IceFilms', 'multi-part': False, 'quality': quality, 'class': '', 'version': label, 'rating': None, 'views': None, 'direct': False}
                    # Strip markup from the host span to get the bare host name.
                    source['host'] = re.sub('(</?[^>]*>)', '', host_fragment)
                    s = s_start + random.randint(3, 1000)
                    m = m_start + random.randint(21, 1000)
                    url = AJAX_URL % (link_id, s, m, secret, t)
                    urls = self.resolve_link(url)
                    source['url'] = urls
                    sources.append(source)
        except Exception as e:
            log_utils.log('Failure (%s) during icefilms get sources: |%s|' % (str(e), video), log_utils.LOGWARNING)
        main_scrape.apply_urlresolver(sources)
    return sources
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search site posts for shows or movies matching *title*.

    Shows: scrape the TAGS anchor out of each 'shows' post, deduping on
    URL.  Movies: pair each <h2> heading with its post div, parse
    'Title (Year) extra' out of the heading text, and keep posts whose
    normalized title overlaps the query and whose year agrees (or is
    absent).  Returns a list of {'url', 'title', 'year'} dicts.
    """
    results = []
    search_url = urlparse.urljoin(self.base_url, '/search/')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, require_debrid=True, cache_limit=1)
    if video_type == 'shows':
        # Dedupe: the same show URL may be tagged on multiple posts.
        seen_urls = {}
        for post in dom_parser.parse_dom(html, 'div', {'id': 'post-\d+'}):
            if 'shows' in post:
                match = re.search('<span>\s*TAGS:\s*</span>\s*<a\s+href="([^"]+)[^>]+>([^<]+)', post, re.I)
                if match:
                    show_url, match_title = match.groups()
                    if show_url not in seen_urls:
                        result = {'url': scraper_utils.pathify_url(show_url), 'title': scraper_utils.cleanse_title(match_title), 'year': ''}
                        seen_urls[show_url] = result
                        results.append(result)
    elif video_type == 'movies':
        # Headings and post divs appear in the same document order, so
        # zip pairs each heading with its post body.
        headings = re.findall('<h2>\s*<a\s+href="([^"]+)[^>]+>(.*?)</a>', html)
        posts = dom_parser.parse_dom(html, 'div', {'id': 'post-\d+'})
        norm_title = scraper_utils.normalize_title(title)
        for heading, post in zip(headings, posts):
            if 'movies' in post and not self.__too_old(post):
                post_url, post_title = heading
                # Split 'Title (2010) Extra.Release.Info' style headings.
                match = re.search('(.*?)\s*[.\[(]?(\d{4})[.)\]]?\s*(.*)', post_title)
                if match:
                    match_title, match_year, extra_title = match.groups()
                    full_title = '%s [%s]' % (match_title, extra_title)
                else:
                    full_title = match_title = post_title
                    match_year = ''
                match_norm_title = scraper_utils.normalize_title(match_title)
                # Substring match in either direction + year agreement.
                if (match_norm_title in norm_title or norm_title in match_norm_title) and (not year or not match_year or year == match_year):
                    result = {'url': scraper_utils.pathify_url(post_url), 'title': scraper_utils.cleanse_title(full_title), 'year': match_year}
                    results.append(result)
    return results
def get_sources(self, video, video_type):
    """Scrape the 'alternativesc' columns of the video page into hosters.

    Each 'altercolumn' div contributes one link/host pair; relative
    links are prefixed with the source URL.
    """
    source_url = self.get_url(video)
    hosters = []
    if not source_url or source_url == FORCE_NO_MATCH:
        return hosters
    page_url = urlparse.urljoin(self.base_url, source_url)
    html = self._http_get(page_url, cache_limit=.5)
    fragment = dom_parser.parse_dom(html, 'div', {'class': 'alternativesc'})
    if fragment:
        for column in dom_parser.parse_dom(fragment[0], 'div', {'class': 'altercolumn'}):
            links = dom_parser.parse_dom(column, 'a', {'class': 'altercolumnlink'}, ret='href')
            hosts = dom_parser.parse_dom(column, 'span')
            if not (links and hosts):
                continue
            stream_link = links[0]
            if not stream_link.startswith('http'):
                stream_link = source_url + stream_link
            host_name = hosts[0]
            hosters.append({
                'hostname': 'PutLocker',
                'multi-part': False,
                'host': host_name,
                'class': '',
                'quality': scraper_utils.get_quality(video, host_name, QUALITIES.HIGH),
                'views': None,
                'rating': None,
                'url': stream_link,
                'direct': False
            })
    main_scrape.apply_urlresolver(hosters)
    return hosters
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Query the theme's AJAX title search and collect matching items.

    Posts the title to the afdah ajax-search endpoint and parses each
    <li> result into a {'url', 'title', 'year'} dict, filtering on year
    when both sides have one.
    """
    results = []
    search_url = urlparse.urljoin(
        self.base_url, '/wp-content/themes/afdah/ajax-search.php')
    payload = {'search': title, 'type': 'title'}
    html = self._http_get(search_url, data=payload, headers=XHR, cache_limit=1)
    for entry in dom_parser.parse_dom(html, 'li'):
        hrefs = dom_parser.parse_dom(entry, 'a', ret='href')
        anchors = dom_parser.parse_dom(entry, 'a')
        if not (hrefs and anchors):
            continue
        entry_title, entry_year = scraper_utils.extra_year(anchors[0])
        if not year or not entry_year or year == entry_year:
            results.append({
                'url': scraper_utils.pathify_url(hrefs[0]),
                'title': scraper_utils.cleanse_title(entry_title),
                'year': entry_year,
            })
    return results
def get_sources(self, video):
    """Collect non-YouTube iframes and 'Version N' anchors as hosters.

    Embedded iframes default to HD720 quality, versioned links to HIGH.
    The assembled hoster list is passed through the URL resolver and
    its result returned.
    """
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        page_url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, headers={'Referer': ''}, cache_limit=.5)
        candidates = []
        for iframe_url in dom_parser.parse_dom(html, 'iframe', ret='src'):
            if 'youtube' in iframe_url:
                continue
            candidates.append((iframe_url, 'embedded', urlparse.urlparse(iframe_url).hostname))
        candidates += re.findall(
            '<a[^>]+href="([^"]+)[^>]+>(Version \d+)</a>([^<]+)', html)
        for stream_url, version, host in candidates:
            if stream_url.startswith('http'):
                url = stream_url
                host = urlparse.urlparse(stream_url).hostname
            else:
                # Relative version links: prefix with the source path and
                # tidy the scraped host text.
                url = source_url + stream_url
                host = host.replace(' ', '')
            base_quality = QUALITIES.HD720 if version == 'embedded' else QUALITIES.HIGH
            hosters.append({
                'hostname': 'Putlocker',
                'multi-part': False,
                'host': host,
                'class': self,
                'quality': scraper_utils.get_quality(video, host, base_quality),
                'views': None,
                'rating': None,
                'url': url,
                'direct': False,
                'version': '(%s)' % (version),
            })
        return main_scrape.apply_urlresolver(hosters)
def web_search(q):
    """Search GitHub repositories for Kodi addon zips matching *q*.

    Scrapes the GitHub advanced-search results page and returns a dict
    shaped like the GitHub API response: {'items': [{'owner': {'login'},
    'name', 'full_name'}, ...]}.
    """
    from HTMLParser import HTMLParser

    class MLStripper(HTMLParser):
        # Collects text nodes only, discarding all markup.
        def __init__(self):
            self.reset()
            self.fed = []

        def handle_data(self, d):
            self.fed.append(d)

        def get_data(self):
            return ''.join(self.fed)

    def strip_tags(markup):
        stripper = MLStripper()
        stripper.feed(markup)
        return stripper.get_data()

    base_url = "https://github.com/search"
    query = "%s extension:zip language:Python path:addon.xml language:Python" % q
    params = {"q": query, "type": "Repositories", "ref": "advsearch"}
    results = {"items": []}
    r = requests.get(base_url, params=params)
    for anchor in dom_parser.parse_dom(r.text, 'a', {"class": "v-align-middle"}):
        full_name = strip_tags(anchor)
        owner, repo = full_name.split("/")[0], full_name.split("/")[1]
        results["items"].append({
            "owner": {"login": owner},
            "name": repo,
            "full_name": full_name,
        })
    return results