def _get_episode_url(self, show_url, video):
    """Walk the show's post pages looking for the episode's post.

    Pages are followed via the 'nextpostslink' anchor until a match is
    found, pages run out, or a post is older than the allowed age.
    Returns a pathified episode URL, or None if nothing matched.
    """
    force_title = scraper_utils.force_title(video)
    title_fallback = kodi.get_setting('title-fallback') == 'true'
    norm_title = scraper_utils.normalize_title(video.ep_title)
    # page_url is a one-element list so the while condition doubles as
    # a "no next page" check (parse_dom returns a list of hrefs).
    page_url = [show_url]
    too_old = False
    while page_url and not too_old:
        url = urlparse.urljoin(self.base_url, page_url[0])
        html = self._http_get(url, require_debrid=True, cache_limit=1)
        posts = dom_parser.parse_dom(html, 'div', {'id': 'post-\d+'})
        for post in posts:
            if self.__too_old(post):
                # Posts are newest-first; once one is too old, stop paging.
                too_old = True
                break
            if CATEGORIES[VIDEO_TYPES.TVSHOW] in post and show_url in post:
                match = re.search('<a\s+href="([^"]+)[^>]+>(.*?)</a>', post)
                if match:
                    url, title = match.groups()
                    if not force_title:
                        # Normal path: match on release-string checks.
                        if scraper_utils.release_check(video, title, require_title=False):
                            return scraper_utils.pathify_url(url)
                    else:
                        # Forced-title path: compare normalized episode titles
                        # pulled from the post body, if the setting allows it.
                        if title_fallback and norm_title:
                            match = re.search('</strong>(.*?)</p>', post)
                            if match and norm_title == scraper_utils.normalize_title(match.group(1)):
                                return scraper_utils.pathify_url(url)
        page_url = dom_parser.parse_dom(html, 'a', {'class': 'nextpostslink'}, ret='href')
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Query the site's search endpoint and scrape result cards.

    Returns a list of dicts with 'url', 'title', and 'year' keys; a
    result is kept when it has no year, the query has no year, or the
    years agree.
    """
    results = []
    html = self._http_get(self.base_url, params={'search': title},
                          headers={'Referer': self.base_url}, cache_limit=8)
    for card in dom_parser.parse_dom(html, 'div', {'class': 'listCard'}):
        titles = dom_parser.parse_dom(card, 'p', {'class': 'extraTitle'})
        urls = dom_parser.parse_dom(card, 'a', ret='href')
        years = dom_parser.parse_dom(card, 'p', {'class': 'cardYear'})
        if not (urls and titles):
            continue
        card_year = years[0] if years else ''
        # Skip only when both years are present and disagree.
        if year and card_year and year != card_year:
            continue
        results.append({
            'url': scraper_utils.pathify_url(urls[0]),
            'title': scraper_utils.cleanse_title(titles[0]),
            'year': card_year,
        })
    return results
def get_sources(self, video, video_type):
    """Scrape the link table of the video's page into hoster dicts.

    Sponsored rows are skipped.  A row needs at least two <span>s
    (host, rating) and an href to be kept.  Returns a list of hoster
    dicts; 'rating' is filled in when the scraped value looks like a
    percentage (e.g. '85%').
    """
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        page_url = urlparse.urljoin(self.base_url, source_url)
        # BUG FIX: header key was misspelled 'Refer', so no Referer was sent.
        headers = {'Referer': self.base_url}
        html = self._http_get(page_url, headers=headers, cache_limit=.5)
        table = dom_parser.parse_dom(html, 'div', {'class': 'linktable'})
        if table:
            for row in dom_parser.parse_dom(table[0], 'tr'):
                spans = dom_parser.parse_dom(row, 'span')
                stream_url = dom_parser.parse_dom(row, 'a', ret='href')
                is_sponsored = any('sponsored' in i.lower() for i in spans)
                if not is_sponsored and len(spans) > 1 and stream_url:
                    host, rating = spans[0], spans[1]
                    stream_url = stream_url[0]
                    quality = scraper_utils.get_quality(
                        video, host, QUALITIES.HIGH)
                    hoster = {
                        'multi-part': False,
                        'host': host,
                        'class': self,
                        'quality': quality,
                        'views': None,
                        'rating': None,
                        'url': stream_url,
                        'direct': False
                    }
                    # BUG FIX: original tested the literal string
                    # 'rating'.endswith('%'), which is always False, so the
                    # scraped rating was never stored.
                    if rating.endswith('%') and rating[:-1].isdigit():
                        hoster['rating'] = rating[:-1]
                    hosters.append(hoster)
    return hosters
def search(self, video_type, title, year, season=''):
    """Search the site and scrape 'aaa_item' result cards.

    The results page has one <h2>-headed fragment per section; shows
    live in the first fragment, movies in the second.  Returns a list
    of dicts with 'url', 'title', and 'year' keys.
    """
    results = []
    search_url = urlparse.urljoin(
        self.base_url,
        '/?s=%s&submit=Search+Now!' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, headers={'Referer': search_url}, cache_limit=8)
    fragments = re.findall('<h2.*?(?=<h2|$)', html, re.DOTALL)
    section = 0 if video_type == 'shows' else 1
    if len(fragments) <= section:
        return results
    for card in dom_parser.parse_dom(fragments[section], 'div', {'class': 'aaa_item'}):
        titles = dom_parser.parse_dom(card, 'a', ret='title')
        urls = dom_parser.parse_dom(card, 'a', ret='href')
        if not (titles and urls):
            continue
        title_year = titles[0]
        # Titles may carry a trailing '(YYYY)' year.
        found = re.search('(.*?)\s+\((\d{4})\)', title_year)
        if found:
            card_title, card_year = found.groups()
        else:
            card_title, card_year = title_year, ''
        if not year or not card_year or year == card_year:
            results.append({
                'url': scraper_utils.pathify_url(urls[0]),
                'title': scraper_utils.cleanse_title(card_title),
                'year': card_year,
            })
    return results
def get_sources(self, video, video_type):
    """Scrape the 'streamlinks' table into hoster dicts.

    Each row yields url/host/age/quality; after the scan, each
    hoster's 'rating' is set to its age percentile within the scraped
    min/max age range.  Returns the list of hoster dicts.
    """
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(url, cache_limit=.5)
        fragment = dom_parser.parse_dom(html, 'table', {'id': 'streamlinks'})
        if fragment:
            # Track the age range so ratings can be normalized afterwards.
            max_age = 0
            now = min_age = int(time.time())
            for row in dom_parser.parse_dom(fragment[0], 'tr', {'id': 'pt\d+'}):
                # Movie and episode rows have slightly different cell layouts.
                if video_type == 'movies':
                    pattern = 'href="([^"]+).*?/>([^<]+).*?(?:<td>.*?</td>\s*){1}<td>(.*?)</td>\s*<td>(.*?)</td>'
                else:
                    pattern = 'href="([^"]+).*?/>([^<]+).*?(<span class="linkdate">.*?)</td>\s*<td>(.*?)</td>'
                match = re.search(pattern, row, re.DOTALL)
                if match:
                    url, host, age, quality = match.groups()
                    age = self.__get_age(now, age)
                    quality = quality.upper()
                    if age > max_age:
                        max_age = age
                    if age < min_age:
                        min_age = age
                    host = host.strip()
                    hoster = {
                        'hostname': 'iWatchOnline',
                        'multi-part': False,
                        'class': '',
                        'url': self.resolve_link(url),
                        'host': host,
                        'age': age,
                        'views': None,
                        'rating': None,
                        'direct': False
                    }
                    hoster['quality'] = scraper_utils.get_quality(
                        video, host, QUALITY_MAP.get(quality, QUALITIES.HIGH))
                    hosters.append(hoster)
            # Convert ages to a 0-100 rating relative to the observed range.
            # NOTE(review): integer division here (Python 2) — ratings are
            # truncated to whole percentiles.
            unit = (max_age - min_age) / 100
            if unit > 0:
                for hoster in hosters:
                    hoster['rating'] = (hoster['age'] - min_age) / unit
        main_scrape.apply_urlresolver(hosters)
    return hosters
def what_sports():
    """List today's televised sports fixtures (UK + US listings).

    Scrapes wheresthematch.com for UK fixtures and tvguide.com for US
    listings, adding one Kodi list item per event.  Pure side-effect
    function: no return value.
    """
    link = OPEN_URL('http://www.wheresthematch.com/tv/home.asp').replace('\r', '').replace('\n', '').replace('\t', '')
    match = re.compile('href="http://www.wheresthematch.com/fixtures/(.+?).asp.+?class="">(.+?)</em> <em class="">v</em> <em class="">(.+?)</em>.+?time-channel ">(.+?)</span>').findall(link)
    for game, name1, name2, gametime in match:
        # Build the label once; the original duplicated the identical
        # string for the item title and its description.
        label = '[COLOR gold][B]' + game + ' ' + '[/COLOR][/B]- [COLOR white]' + name1 + ' vs ' + name2 + ' - ' + gametime + ' [/COLOR]'
        kodi.addItem(label, '', '', artwork + 'icon.png', description=label)
    xbmc.executebuiltin("Container.SetViewMode(55)")
    # #######AMERICAN###############
    link = OPEN_URL('http://www.tvguide.com/sports/live-today/').replace('\r', '').replace('\n', '').replace('\t', '')
    sections = dom_parser.parse_dom(link, 'div', {'class': "listings-program-content"})
    # FIX: removed the unused 'listings' parse, and renamed the loop
    # variable 'time' -> 'airtime' so it no longer shadows the time module.
    for stuff in sections:
        match = re.compile('class="listings-program-link">(.+?)</span></h3>.+?class="listings-program-link">.+?listings-program-airing-info">(.+?)</p><p.+?description">(.+?)</p>').findall(stuff)
        for name, airtime, description in match:
            label = '[COLOR gold][B]' + name_cleaner(name) + ' ' + '[/COLOR][/B]- [COLOR white]' + ' - ' + airtime + ' [/COLOR]'
            kodi.addItem(label, '', '', artwork + 'icon.png', description=label)
    viewsetter.set_view("files")
def get_sources(self, video, video_type):
    """Scrape 'ldr-item' blocks on the episode page into hoster dicts.

    Each item contributes its data-actuallink URL, an optional view
    count, and an optional 0-100 rating derived from the site's
    'point' score.
    """
    source_url = self.get_url(video)
    hosters = []
    if not source_url or source_url == FORCE_NO_MATCH:
        return hosters
    page_url = urlparse.urljoin(self.base_url, source_url)
    html = self._http_get(page_url, cache_limit=.25)
    for item in dom_parser.parse_dom(html, 'div', {'class': '[^"]*ldr-item[^"]*'}):
        actual = dom_parser.parse_dom(item, 'a', ret='data-actuallink')
        views = None
        counts = dom_parser.parse_dom(item, 'div', {'class': 'click-count'})
        if counts:
            count_match = re.search(' (\d+) ', counts[0])
            if count_match:
                views = count_match.group(1)
        points = dom_parser.parse_dom(item, 'div', {'class': '\s*point\s*'})
        if points:
            points = int(points[0])
        # A zero/missing score means no rating.
        rating = points * 10 if points else None
        if not actual:
            continue
        stream_url = actual[0].strip()
        host = urlparse.urlparse(stream_url).hostname
        hosters.append({
            'hostname': 'WatchEpisodes',
            'multi-part': False,
            'host': host,
            'class': self,
            'quality': scraper_utils.get_quality(video, host, QUALITIES.HIGH),
            'views': views,
            'rating': rating,
            'url': stream_url,
            'direct': False
        })
    main_scrape.apply_urlresolver(hosters)
    return hosters
def resolve_link(self, link):
    """Resolve a scraped link to a playable URL.

    Relative links are fetched from the site and the first iframe's
    src is returned (falling back to the original link).  Absolute
    links are returned unchanged.
    """
    # BUG FIX: the original had no return on the absolute-link path and
    # implicitly returned None for any link already starting with 'http'.
    if link.startswith('http'):
        return link
    stream_url = urlparse.urljoin(self.base_url, link)
    html = self._http_get(stream_url, cache_limit=0)
    iframe_url = dom_parser.parse_dom(html, 'iframe', ret='src')
    if iframe_url:
        return iframe_url[0]
    return link
def resolve_link(self, link):
    """Resolve a scraped link to a playable URL.

    Relative links are fetched and the 'myButton p2' anchor's href is
    returned (falling back to the original link).  Absolute links are
    returned unchanged.
    """
    # BUG FIX: the original only returned inside the relative-link branch,
    # so an absolute ('http...') link fell through and returned None.
    if link.startswith('http'):
        return link
    url = urlparse.urljoin(self.base_url, link)
    html = self._http_get(url, cache_limit=0)
    stream_url = dom_parser.parse_dom(html, 'a', {'class': 'myButton p2'}, ret='href')
    if stream_url:
        return stream_url[0]
    return link
def _get_episode_url(self, show_url, video):
    """Find the episode URL on the show's page by SxxExx pattern.

    Fetches the show page and searches its HTML for an href containing
    the zero-padded season/episode marker.  Returns a pathified URL, or
    None when the page is empty or no link matches.
    """
    # FIX: removed a duplicated comment line and the unused 'episodes'
    # local (its parse_dom result was never read).
    url = urlparse.urljoin(self.base_url, show_url)
    html = self._http_get(url, cache_limit=2)
    if html:
        # (?!\d) guards against e.g. S01E01 matching inside S01E011.
        episode_pattern = 'href="([^"]*-[sS]%02d[eE]%02d(?!\d)[^"]*)' % (
            int(video.season), int(video.episode))
        match = re.search(episode_pattern, html)
        if match:
            return scraper_utils.pathify_url(match.group(1))
def get_sources(self, video, video_type):
    """Scrape IceFilms sources for *video*.

    Loads the video page, follows the 'videoframe' iframe, extracts the
    site's anti-scrape tokens (secret, t, s, m) from the frame's script,
    then builds one AJAX resolve URL per listed host link.  Any failure
    is logged and an empty/partial list is returned.
    """
    source_url = self.get_url(video)
    sources = []
    if source_url and source_url != FORCE_NO_MATCH:
        try:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=2)
            pattern = '<iframe id="videoframe" src="([^"]+)'
            match = re.search(pattern, html)
            url = urlparse.urljoin(self.base_url, match.group(1))
            html = self._http_get(url, cache_limit=.5)
            # The 'secret' value may be split across two JS string literals;
            # groups('') joins them with missing groups as ''.
            match = re.search('lastChild\.value="([^"]+)"(?:\s*\+\s*"([^"]+))?', html)
            secret = ''.join(match.groups(''))
            match = re.search('"&t=([^"]+)', html)
            t = match.group(1)
            # s and m are numeric counters the site expects to have advanced;
            # random offsets are added below when building each AJAX URL.
            match = re.search('(?:\s+|,)s\s*=(\d+)', html)
            s_start = int(match.group(1))
            match = re.search('(?:\s+|,)m\s*=(\d+)', html)
            m_start = int(match.group(1))
            for fragment in dom_parser.parse_dom(html, 'div', {'class': 'ripdiv'}):
                # Each 'ripdiv' section is headed by a <b> quality label.
                match = re.match('<b>(.*?)</b>', fragment)
                if match:
                    q_str = match.group(1).replace(' ', '').upper()
                    quality = QUALITY_MAP.get(q_str, QUALITIES.HIGH)
                else:
                    quality = QUALITIES.HIGH
                pattern = '''onclick='go\((\d+)\)'>([^<]+)(<span.*?)</a>'''
                for match in re.finditer(pattern, fragment):
                    link_id, label, host_fragment = match.groups()
                    source = {'hostname': 'IceFilms', 'multi-part': False, 'quality': quality, 'class': '', 'version': label, 'rating': None, 'views': None, 'direct': False}
                    # Strip markup from the host span to get the bare host name.
                    source['host'] = re.sub('(</?[^>]*>)', '', host_fragment)
                    s = s_start + random.randint(3, 1000)
                    m = m_start + random.randint(21, 1000)
                    url = AJAX_URL % (link_id, s, m, secret, t)
                    urls = self.resolve_link(url)
                    source['url'] = urls
                    sources.append(source)
        except Exception as e:
            log_utils.log('Failure (%s) during icefilms get sources: |%s|' % (str(e), video), log_utils.LOGWARNING)
        main_scrape.apply_urlresolver(sources)
    return sources
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search site posts for shows or movies matching *title*.

    Shows: scrape the TAGS anchor out of each 'shows' post, deduping on
    URL.  Movies: pair each <h2> heading with its post div, parse
    'Title (Year) extra' out of the heading text, and keep posts whose
    normalized title overlaps the query and whose year agrees (or is
    absent).  Returns a list of {'url', 'title', 'year'} dicts.
    """
    results = []
    search_url = urlparse.urljoin(self.base_url, '/search/')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, require_debrid=True, cache_limit=1)
    if video_type == 'shows':
        # Dedupe: the same show URL may be tagged on multiple posts.
        seen_urls = {}
        for post in dom_parser.parse_dom(html, 'div', {'id': 'post-\d+'}):
            if 'shows' in post:
                match = re.search('<span>\s*TAGS:\s*</span>\s*<a\s+href="([^"]+)[^>]+>([^<]+)', post, re.I)
                if match:
                    show_url, match_title = match.groups()
                    if show_url not in seen_urls:
                        result = {'url': scraper_utils.pathify_url(show_url), 'title': scraper_utils.cleanse_title(match_title), 'year': ''}
                        seen_urls[show_url] = result
                        results.append(result)
    elif video_type == 'movies':
        # Headings and post divs appear in the same document order, so
        # zip pairs each heading with its post body.
        headings = re.findall('<h2>\s*<a\s+href="([^"]+)[^>]+>(.*?)</a>', html)
        posts = dom_parser.parse_dom(html, 'div', {'id': 'post-\d+'})
        norm_title = scraper_utils.normalize_title(title)
        for heading, post in zip(headings, posts):
            if 'movies' in post and not self.__too_old(post):
                post_url, post_title = heading
                # Split 'Title (2010) Extra.Release.Info' style headings.
                match = re.search('(.*?)\s*[.\[(]?(\d{4})[.)\]]?\s*(.*)', post_title)
                if match:
                    match_title, match_year, extra_title = match.groups()
                    full_title = '%s [%s]' % (match_title, extra_title)
                else:
                    full_title = match_title = post_title
                    match_year = ''
                match_norm_title = scraper_utils.normalize_title(match_title)
                # Substring match in either direction + year agreement.
                if (match_norm_title in norm_title or norm_title in match_norm_title) and (not year or not match_year or year == match_year):
                    result = {'url': scraper_utils.pathify_url(post_url), 'title': scraper_utils.cleanse_title(full_title), 'year': match_year}
                    results.append(result)
    return results
def get_sources(self, video, video_type):
    """Scrape the 'alternativesc' columns of the video page into hosters.

    Each 'altercolumn' div contributes one link/host pair; relative
    links are prefixed with the source URL.
    """
    source_url = self.get_url(video)
    hosters = []
    if not source_url or source_url == FORCE_NO_MATCH:
        return hosters
    page_url = urlparse.urljoin(self.base_url, source_url)
    html = self._http_get(page_url, cache_limit=.5)
    fragment = dom_parser.parse_dom(html, 'div', {'class': 'alternativesc'})
    if fragment:
        for column in dom_parser.parse_dom(fragment[0], 'div', {'class': 'altercolumn'}):
            links = dom_parser.parse_dom(column, 'a', {'class': 'altercolumnlink'}, ret='href')
            hosts = dom_parser.parse_dom(column, 'span')
            if not (links and hosts):
                continue
            stream_link = links[0]
            if not stream_link.startswith('http'):
                stream_link = source_url + stream_link
            host_name = hosts[0]
            hosters.append({
                'hostname': 'PutLocker',
                'multi-part': False,
                'host': host_name,
                'class': '',
                'quality': scraper_utils.get_quality(video, host_name, QUALITIES.HIGH),
                'views': None,
                'rating': None,
                'url': stream_link,
                'direct': False
            })
    main_scrape.apply_urlresolver(hosters)
    return hosters
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Query the theme's AJAX title search and collect matching items.

    Posts the title to the afdah ajax-search endpoint and parses each
    <li> result into a {'url', 'title', 'year'} dict, filtering on year
    when both sides have one.
    """
    results = []
    search_url = urlparse.urljoin(
        self.base_url, '/wp-content/themes/afdah/ajax-search.php')
    payload = {'search': title, 'type': 'title'}
    html = self._http_get(search_url, data=payload, headers=XHR, cache_limit=1)
    for entry in dom_parser.parse_dom(html, 'li'):
        hrefs = dom_parser.parse_dom(entry, 'a', ret='href')
        anchors = dom_parser.parse_dom(entry, 'a')
        if not (hrefs and anchors):
            continue
        entry_title, entry_year = scraper_utils.extra_year(anchors[0])
        if not year or not entry_year or year == entry_year:
            results.append({
                'url': scraper_utils.pathify_url(hrefs[0]),
                'title': scraper_utils.cleanse_title(entry_title),
                'year': entry_year,
            })
    return results
def get_sources(self, video):
    """Collect non-YouTube iframes and 'Version N' anchors as hosters.

    Embedded iframes default to HD720 quality, versioned links to HIGH.
    The assembled hoster list is passed through the URL resolver and
    its result returned.
    """
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        page_url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, headers={'Referer': ''}, cache_limit=.5)
        candidates = []
        for iframe_url in dom_parser.parse_dom(html, 'iframe', ret='src'):
            if 'youtube' in iframe_url:
                continue
            candidates.append((iframe_url, 'embedded', urlparse.urlparse(iframe_url).hostname))
        candidates += re.findall(
            '<a[^>]+href="([^"]+)[^>]+>(Version \d+)</a>([^<]+)', html)
        for stream_url, version, host in candidates:
            if stream_url.startswith('http'):
                url = stream_url
                host = urlparse.urlparse(stream_url).hostname
            else:
                # Relative version links: prefix with the source path and
                # tidy the scraped host text.
                url = source_url + stream_url
                host = host.replace(' ', '')
            base_quality = QUALITIES.HD720 if version == 'embedded' else QUALITIES.HIGH
            hosters.append({
                'hostname': 'Putlocker',
                'multi-part': False,
                'host': host,
                'class': self,
                'quality': scraper_utils.get_quality(video, host, base_quality),
                'views': None,
                'rating': None,
                'url': url,
                'direct': False,
                'version': '(%s)' % (version),
            })
        return main_scrape.apply_urlresolver(hosters)
def web_search(q):
    """Search GitHub repositories for Kodi addon zips matching *q*.

    Scrapes the GitHub advanced-search results page and returns a dict
    shaped like the GitHub API response: {'items': [{'owner': {'login'},
    'name', 'full_name'}, ...]}.
    """
    from HTMLParser import HTMLParser

    class MLStripper(HTMLParser):
        # Collects text nodes only, discarding all markup.
        def __init__(self):
            self.reset()
            self.fed = []

        def handle_data(self, d):
            self.fed.append(d)

        def get_data(self):
            return ''.join(self.fed)

    def strip_tags(markup):
        stripper = MLStripper()
        stripper.feed(markup)
        return stripper.get_data()

    base_url = "https://github.com/search"
    query = "%s extension:zip language:Python path:addon.xml language:Python" % q
    params = {"q": query, "type": "Repositories", "ref": "advsearch"}
    results = {"items": []}
    r = requests.get(base_url, params=params)
    for anchor in dom_parser.parse_dom(r.text, 'a', {"class": "v-align-middle"}):
        full_name = strip_tags(anchor)
        owner, repo = full_name.split("/")[0], full_name.split("/")[1]
        results["items"].append({
            "owner": {"login": owner},
            "name": repo,
            "full_name": full_name,
        })
    return results