def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search the site's post listing for ``title``.

    For TV shows, one result is produced per distinct TAGS link found in a
    post of the matching category. For movies, each ``<h2>`` heading is
    paired with its post block and kept when the normalized titles overlap
    and the year is compatible.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    results = []
    html = self._http_get('/search/' + urllib.quote_plus(title), require_debrid=True, cache_limit=1)

    if video_type == VIDEO_TYPES.TVSHOW:
        seen_urls = {}
        for _attr, post in dom_parser2.parse_dom(html, 'div', {'id': re.compile('post-\d+')}):
            if CATEGORIES[video_type] not in post:
                continue
            tag_link = re.search('<span>\s*TAGS:\s*</span>\s*<a\s+href="([^"]+)[^>]+>([^<]+)', post, re.I)
            if not tag_link:
                continue
            show_url, match_title = tag_link.groups()
            if show_url in seen_urls:
                # Several posts can point at the same show page; list it once.
                continue
            entry = {'url': scraper_utils.pathify_url(show_url),
                     'title': scraper_utils.cleanse_title(match_title),
                     'year': ''}
            seen_urls[show_url] = entry
            results.append(entry)
    elif video_type == VIDEO_TYPES.MOVIE:
        norm_title = scraper_utils.normalize_title(title)
        headings = re.findall('<h2>\s*<a\s+href="([^"]+)[^>]+>(.*?)</a>', html)
        posts = [element.content for element in dom_parser2.parse_dom(html, 'div', {'id': re.compile('post-\d+')})]
        for heading, post in zip(headings, posts):
            if CATEGORIES[video_type] not in post or self.__too_old(post):
                continue
            post_url, post_title = heading
            meta = scraper_utils.parse_movie_link(post_title)
            full_title = '%s [%s] (%sp)' % (meta['title'], meta['extra'], meta['height'])
            match_year = meta['year']
            match_norm_title = scraper_utils.normalize_title(meta['title'])
            titles_overlap = match_norm_title in norm_title or norm_title in match_norm_title
            year_ok = not year or not match_year or year == match_year
            if titles_overlap and year_ok:
                results.append({'url': scraper_utils.pathify_url(post_url),
                                'title': scraper_utils.cleanse_title(full_title),
                                'year': match_year})

    return results
def search(self, video_type, title, year, season=''):
    """Search via ``/index.php?search=...`` and scrape the result table.

    When the response contains a ``window.location`` redirect instead of a
    results page, the redirect target (unless it is 'movies.php') is
    returned as the single result, using the requested title and year.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    search_url = urlparse.urljoin(self.base_url, '/index.php?search=%s&image.x=0&image.y=0') % (urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=.25)
    results = []

    if re.search('window\.location', html):
        # Not a results page: the site redirected straight to a title.
        redirect = re.search('window\.location\s+=\s+"([^"]+)', html)
        if redirect:
            target = redirect.group(1)
            if target != 'movies.php':
                results.append({'url': scraper_utils.pathify_url(target),
                                'title': scraper_utils.cleanse_title(title),
                                'year': year})
        return results

    row_pattern = '<td[^>]+class="movieText"[^>]*>(.*?)</p>.*?href="(/watch/[^"]+)'
    for row in re.finditer(row_pattern, html, re.DOTALL):
        match_title_year, match_url = row.groups('')
        # Skip adult entries.
        if '-XXX-' in match_url.upper() or ' XXX:' in match_title_year:
            continue

        match_title_year = re.sub('</?.*?>', '', match_title_year)
        title_year = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
        if title_year:
            match_title, match_year = title_year.groups()
        else:
            match_title, match_year = match_title_year, ''

        if not year or not match_year or year == match_year:
            results.append({'url': match_url,
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year})
    return results
def search(self, video_type, title, year, season=''):
    """Query ``/index.php`` with the title and scrape the result table.

    A response containing a ``window.location`` redirect is treated as a
    direct hit: the redirect target (unless it is 'movies.php') becomes the
    only result, labelled with the requested title and year.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    results = []
    query = {'search': title, 'image.x': 0, 'image.y': 0}
    html = self._http_get(urlparse.urljoin(self.base_url, '/index.php'), params=query, cache_limit=1)

    if re.search('window\.location', html):
        # Not a results page: follow the scripted redirect instead.
        redirect = re.search('window\.location\s+=\s+"([^"]+)', html)
        if redirect:
            target = redirect.group(1)
            if target != 'movies.php':
                results.append({'url': scraper_utils.pathify_url(target),
                                'title': scraper_utils.cleanse_title(title),
                                'year': year})
        return results

    row_pattern = '<td[^>]+class="movieText"[^>]*>(.*?)</p>.*?href="(/watch/[^"]+)'
    for row in re.finditer(row_pattern, html, re.DOTALL):
        match_title_year, match_url = row.groups('')
        # Skip adult entries.
        if '-XXX-' in match_url.upper() or ' XXX:' in match_title_year:
            continue

        match_title_year = re.sub('</?.*?>', '', match_title_year)
        match_title, match_year = scraper_utils.extra_year(match_title_year)
        if not year or not match_year or year == match_year:
            results.append({'url': match_url,
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year})
    return results
def __get_ok(self, embed, flashvars):
    """Resolve an embedded player to a hoster entry via its metadata URL.

    ``flashvars[0].attrs['value']`` is searched for ``metadataUrl=``; the
    decoded URL is fetched with the embed's ``data`` attribute as Referer,
    and the JSON ``movie.url`` field, when present, becomes the stream URL.

    Returns a list holding zero or one hoster dict.
    """
    hosters = []
    link = flashvars[0].attrs['value']
    meta_match = re.search('metadataUrl=([^"]+)', link)
    if not meta_match:
        return hosters

    referer = scraper_utils.cleanse_title(urllib.unquote(embed[0].attrs['data']))
    ok_url = scraper_utils.cleanse_title(urllib.unquote(meta_match.group(1)))
    html = self._http_get(ok_url, data='ok', headers={'Referer': referer}, cache_limit=.25)
    js_data = scraper_utils.parse_json(html, ok_url)
    stream_url = js_data.get('movie', {}).get('url')
    if stream_url is None:
        return hosters

    hosters.append({
        'multi-part': False,
        'host': urlparse.urlparse(stream_url).hostname,
        'class': self,
        'quality': QUALITIES.HD720,
        'views': None,
        'rating': None,
        'url': stream_url,
        'direct': False,
        'subs': 'Turkish Subtitles',
    })
    return hosters
def search(self, video_type, title, year, season=''):
    """Search with SEARCH_URL (sent with the XHR headers) and parse film items.

    Each ``li.films-item`` contributes a result when it has both a link and
    a title and its year (first run of digits in the year block, if any) is
    compatible with ``year``.

    Returns a list of dicts with 'title', 'year' and 'url' keys.
    """
    results = []
    search_url = urlparse.urljoin(self.base_url, SEARCH_URL) % (urllib.quote_plus(title))
    html = self._http_get(search_url, headers=XHR, cache_limit=1)

    for film in dom_parser.parse_dom(html, 'li', {'class': 'films-item'}):
        hrefs = dom_parser.parse_dom(film, 'a', ret='href')
        titles = dom_parser.parse_dom(film, 'div', {'class': 'films-item-title'})
        years = dom_parser.parse_dom(film, 'div', {'class': 'films-item-year'})
        if not (hrefs and titles):
            continue

        match_title = re.sub('</?span>', '', titles[0])
        digits = re.search('(\d+)', years[0]) if years else None
        match_year = digits.group(1) if digits else ''

        if not year or not match_year or year == match_year:
            results.append({'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year,
                            'url': hrefs[0]})
    return results
def __alt_search(self, video_type, title, year, season=''):
    """Search ``/search`` with a free-text key built from title, year and season.

    Items are classified as seasons when their URL contains ``-season-N``.
    Only items of the requested kind are kept, and season searches also
    require the URL to end in the requested season number. The year is read
    from a ``-YYYY`` segment of the URL.

    Returns a list of dicts with 'title', 'year' and 'url' keys.
    """
    results = []
    query = title.lower()
    if year:
        query += ' %s' % (year)
    if video_type == VIDEO_TYPES.SEASON and season:
        query += ' Season %s' % (season)

    search_url = urlparse.urljoin(self.base_url, '/search')
    html = self._http_get(search_url, params={'key': query}, cache_limit=1)
    norm_title = scraper_utils.normalize_title(title)

    for item in dom_parser.parse_dom(html, 'div', {'class': 'caption'}):
        link = re.search('href="([^"]+)[^>]+>(.*?)<span[^>]*>', item)
        if not link:
            continue
        match_url, match_title = link.groups()
        is_season = re.search('-season-\d+', match_url)
        wanted = (video_type == VIDEO_TYPES.MOVIE and not is_season) or (video_type == VIDEO_TYPES.SEASON and is_season)
        if not wanted:
            continue
        if video_type == VIDEO_TYPES.SEASON and season and not re.search('season-0*%s$' % (season), match_url):
            continue

        match_title = re.sub('</?[^>]*>', '', match_title)
        match_title = re.sub('\s+Full\s+Movie', '', match_title)
        url_year = re.search('-(\d{4})(?:$|-)', match_url)
        match_year = url_year.group(1) if url_year else ''

        if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
            results.append({'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year,
                            'url': scraper_utils.pathify_url(match_url)})
    return results
def __search(self, video_type, title, year, season=''):
    """Search through the base64-encoded SEARCH_URL endpoint (JSON results).

    Only results whose URL contains ``/watch/`` are considered. An entry is
    a season when its title contains ``Season N``; movies get their year
    from the title, seasons must match the requested season number.

    Returns a list of dicts with 'title', 'year' and 'url' keys.
    """
    results = []
    search_url = base64.decodestring(SEARCH_URL) % (urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=1)
    js_data = scraper_utils.parse_json(html)
    norm_title = scraper_utils.normalize_title(title)

    for item in js_data.get('results', []):
        if '/watch/' not in item['url'].lower():
            continue

        is_season = re.search('Season\s+(\d+)', item['titleNoFormatting'], re.IGNORECASE)
        wanted = (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON)
        if not wanted:
            continue

        match_title_year = re.sub('^Watch\s+', '', item['titleNoFormatting'])
        match_url = item['url']
        match_year = ''
        if video_type == VIDEO_TYPES.MOVIE:
            parsed = re.search('(.*?)(?:\s+\(?(\d{4})\)?)', match_title_year)
            if parsed:
                match_title, match_year = parsed.groups()
            else:
                match_title = match_title_year
        else:
            if season and int(is_season.group(1)) != int(season):
                continue
            parsed = re.search('(.*?)\s+\(\d{4}\)', match_title_year)
            match_title = parsed.group(1) if parsed else match_title_year

        if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
            results.append({'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year,
                            'url': scraper_utils.pathify_url(match_url)})
    return results
def search(self, video_type, title, year, season=''):
    """Run the site's advanced search and parse ``list_box_title`` entries.

    TV show and episode searches use the ``/tvshow`` section of the site.
    The title attribute of each hit is split into title and four-digit
    year where possible.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    url_parts = [self.base_url]
    if video_type in [VIDEO_TYPES.TVSHOW, VIDEO_TYPES.EPISODE]:
        url_parts.append('/tvshow')
    url_parts.append('/advanced-search.php?search=')
    url_parts.append(urllib.quote_plus(title))
    url_parts.append('&year=' + urllib.quote_plus(str(year)))
    url_parts.append('&advanced_search=Search')
    search_url = ''.join(url_parts)

    html = self._http_get(search_url, cache_limit=.25)
    results = []
    for element in dom_parser.parse_dom(html, 'div', {'class': 'list_box_title'}):
        link = re.search('href="([^"]+)"\s+title="(?:Watch )?([^"]+)', element)
        if not link:
            continue
        url, match_title_year = link.groups()
        title_year = re.search('(.*?)(?:\s+\(?\s*(\d{4})\s*\)?)', match_title_year)
        if title_year:
            match_title, match_year = title_year.groups()
        else:
            match_title, match_year = match_title_year, ''

        if not year or not match_year or year == match_year:
            results.append({'url': scraper_utils.pathify_url(url),
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year})
    return results
def search(self, video_type, title, year, season=''):
    """Search ``/?s=<title>`` and parse ``div.item`` entries.

    Items whose ``calidad2`` label is in SEARCH_EXCLUDE, or whose title
    contains 'SEZON', are skipped. Words listed in TITLE_STRIP (compared
    upper-cased) are removed from the title.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    results = []
    search_url = urlparse.urljoin(self.base_url, '/?s=') + urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=8)
    title_strip = [word.decode('utf-8') for word in TITLE_STRIP]

    for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
        link = re.search('href="([^"]+)', item)
        title_frag = dom_parser.parse_dom(item, 'span', {'class': 'tt'})
        if not (link and title_frag):
            continue

        item_type = dom_parser.parse_dom(item, 'span', {'class': 'calidad2'})
        if item_type and item_type[0] in SEARCH_EXCLUDE:
            continue

        match_url = link.group(1)
        match_title = title_frag[0]
        if 'SEZON' in match_title.upper():
            continue

        year_frag = dom_parser.parse_dom(item, 'span', {'class': 'year'})
        match_year = year_frag[0] if year_frag else ''

        kept_words = [word for word in match_title.split() if word.upper() not in title_strip]
        match_title = ' '.join(kept_words)
        if (not year or not match_year or year == match_year):
            results.append({'url': scraper_utils.pathify_url(match_url),
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year})
    return results
def __movie_search(self, title, year):
    """Search ``/search?q=<title>`` and parse ``div.video_item`` entries.

    The listing provides no year, so every result carries an empty 'year'
    and ``year`` cannot narrow the results; matching is done on the
    normalized title only.

    Args:
        title: Title to search for.
        year: Requested year. Kept for interface compatibility; unused
            because no year is scraped from the page.

    Returns:
        A list of dicts with 'url', 'title' and 'year' keys.
    """
    results = []
    search_url = urlparse.urljoin(self.base_url, '/search?q=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)
    norm_title = scraper_utils.normalize_title(title)

    for item in dom_parser.parse_dom(html, 'div', {'class': 'video_item'}):
        match_url = dom_parser.parse_dom(item, 'a', ret='href')
        match_title = dom_parser.parse_dom(item, 'img', ret='alt')
        if not (match_url and match_title):
            continue
        match_url, match_title = match_url[0], match_title[0]

        # The original tested `if match_year:` right after setting it to '',
        # which could never be true; with an empty year the year filter is
        # always satisfied, so only the title comparison remains.
        match_year = ''
        if norm_title in scraper_utils.normalize_title(match_title):
            results.append({'url': scraper_utils.pathify_url(match_url),
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year})
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search ``/results`` and parse ``div.cell`` entries.

    A POST to ``av`` is issued first with the search page as Referer, then
    the search is fetched with the ``begin_referer``/``prounder`` cookies.
    If the returned page references ``jquery.js`` in a script tag, the
    search is fetched again with caching disabled.

    Args:
        video_type: Requested video type (not used for filtering here).
        title: Title to search for.
        year: Requested year, or a falsy value to accept any year.
        season: Unused.

    Returns:
        A list of dicts with 'url', 'title' and 'year' keys.
    """
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/results')
    params = {'q': title}
    referer = search_url + '?' + urllib.urlencode(params)
    headers = {'Referer': referer}
    headers.update(XHR)
    # The response body is not needed; the request is made for its effect on the session.
    _html = self._http_get(scraper_utils.urljoin(self.base_url, 'av'), headers=headers, method='POST', cache_limit=0)

    cookies = {'begin_referer': referer, 'prounder': 1}
    html = self._http_get(search_url, params=params, cookies=cookies, cache_limit=8)
    if any('jquery.js' in script.attrs['src'] for script in dom_parser2.parse_dom(html, 'script', req='src')):
        html = self._http_get(search_url, params=params, cookies=cookies, cache_limit=0)

    for _attrs, cell in dom_parser2.parse_dom(html, 'div', {'class': 'cell'}):
        title_frag = dom_parser2.parse_dom(cell, 'div', {'class': 'video_title'})
        year_frag = dom_parser2.parse_dom(cell, 'div', {'class': 'video_quality'})
        if not title_frag:
            continue
        link = dom_parser2.parse_dom(title_frag[0].content, 'a', req='href')
        if not link:
            continue

        match_url = link[0].attrs['href']
        match_title = link[0].content
        try:
            match_year = re.search('\s+(\d{4})\s+', year_frag[0].content).group(1)
        except (AttributeError, IndexError, TypeError):
            # No quality block, no usable content, or no 4-digit year in it.
            # Narrowed from a bare `except:` so unrelated errors (including
            # KeyboardInterrupt/SystemExit) are no longer swallowed.
            match_year = ''

        if not year or not match_year or year == match_year:
            results.append({'url': scraper_utils.pathify_url(match_url),
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year})
    return results
def search(self, video_type, title, year, season=""):
    """Search ``/search.php`` and parse the Movies or TV Series section.

    Non-movie searches return nothing unless ``self.include_paid`` is set.
    The section following the matching ``<i>`` heading is scanned for
    single-quoted links, each optionally followed by ``(YYYY)``.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    if not self.include_paid and video_type != VIDEO_TYPES.MOVIE:
        return []

    search_url = urlparse.urljoin(self.base_url, "/search.php")
    html = self._http_get(search_url, params={"q": title}, cache_limit=0.25)
    results = []

    if video_type == VIDEO_TYPES.MOVIE:
        section_pattern = "<i>\s*Movies\s*</i>(.*)"
    else:
        section_pattern = "<i>\s*TV Series\s*</i>(.*)"
    section = re.search(section_pattern, html)
    if not section:
        return results

    link_pattern = "href='([^']+)'>([^<]+)\s*</a>\s*(?:\((\d{4})\))?"
    for hit in re.finditer(link_pattern, section.group(1)):
        url, match_title, match_year = hit.groups("")
        if not year or not match_year or year == match_year:
            results.append({
                "url": scraper_utils.pathify_url(url),
                "title": scraper_utils.cleanse_title(match_title),
                "year": match_year,
            })
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search ``/search-movies/<title>.html`` and parse ``li.item`` entries.

    Title and year come from the bold text inside each item's
    ``onmouseover`` tooltip. Items mentioning ``season N`` are treated as
    seasons; movies without a year in the title fall back to the tooltip's
    ``Release:`` field.

    Returns a list of dicts with 'title', 'year' and 'url' keys.
    """
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/search-movies/%s.html' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, cache_limit=8)

    for _attrs, item in dom_parser2.parse_dom(html, 'li', {'class': 'item'}):
        links = dom_parser2.parse_dom(item, 'a', req='href')
        tooltip = re.search('onmouseover="([^"]+)', item)
        if not (links and tooltip):
            continue

        match_url = links[0].attrs['href']
        match_title_year = tooltip.group(1)
        bold = re.search('<b>(?:<i>)?\s*(.*?)\s*(?:</i>)?</b>', match_title_year)
        if not bold:
            continue
        match_title, match_year = scraper_utils.extra_year(bold.group(1))

        is_season = re.search('season\s+(\d+)', match_title_year, re.I)
        if (is_season and video_type == VIDEO_TYPES.MOVIE) or (not is_season and video_type == VIDEO_TYPES.SEASON):
            continue

        if video_type == VIDEO_TYPES.MOVIE:
            if not match_year:
                release = re.search('>Release:\s*(\d{4})', match_title_year)
                match_year = release.group(1) if release else ''
        else:
            if season and int(season) != int(is_season.group(1)):
                continue

        if not year or not match_year or year == match_year:
            results.append({'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year,
                            'url': scraper_utils.pathify_url(match_url)})
    return results
def search(self, video_type, title, year, season=''):
    """Search ``/search/<title>.html`` and parse the first ``div.movie`` list.

    Entries whose title ends in ``Season N`` are seasons. Movies take
    their year from ``span.year``; seasons must match the requested season
    number and carry an empty year. Results are not filtered by ``year``.

    Returns a list of dicts with 'title', 'year' and 'url' keys.
    """
    search_url = urlparse.urljoin(self.base_url, '/search/%s.html') % (urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=1)
    results = []

    fragment = dom_parser.parse_dom(html, 'div', {'class': 'movie'})
    if not fragment:
        return results

    for item in dom_parser.parse_dom(fragment[0], 'li'):
        hrefs = dom_parser.parse_dom(item, 'a', ret='href')
        titles = dom_parser.parse_dom(item, 'span', {'class': 'text'})
        years = dom_parser.parse_dom(item, 'span', {'class': 'year'})
        if not (hrefs and titles):
            continue

        match_url = hrefs[0]
        match_title = re.sub('</?strong>', '', titles[0])
        is_season = re.search('Season\s+(\d+)$', match_title, re.I)
        wanted = (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON)
        if not wanted:
            continue

        if video_type == VIDEO_TYPES.MOVIE:
            match_year = years[0] if years else ''
        else:
            if season and int(is_season.group(1)) != int(season):
                continue
            match_year = ''

        results.append({'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year,
                        'url': scraper_utils.pathify_url(match_url)})
    return results
def search(self, video_type, title, year, season=''):
    """Search ``/search/<title>`` and parse ``div.name_top`` entries.

    An entry is a season when its link text contains ``Season N``. Movie
    searches keep only non-season entries and split the text into title
    and year; season searches keep only season entries with the requested
    season number.

    Args:
        video_type: VIDEO_TYPES.MOVIE or VIDEO_TYPES.SEASON.
        title: Title to search for.
        year: Requested year, or a falsy value to accept any year.
        season: Requested season number, or a falsy value for any season.

    Returns:
        A list of dicts with 'title', 'year' and 'url' keys.
    """
    search_url = urlparse.urljoin(self.base_url, '/search/%s' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, cache_limit=.25)
    results = []

    for item in dom_parser.parse_dom(html, 'div', {'class': 'name_top'}):
        link = re.search('href="([^"]+)[^>]+>([^<]+)', item)
        if not link:
            continue
        url, match_title_year = link.groups()
        is_season = re.search('Season\s+(\d+)', match_title_year, re.IGNORECASE)

        # The original condition read `is_season and VIDEO_TYPES.SEASON`,
        # which is true for every season entry regardless of the requested
        # type, so season entries leaked into movie searches.
        is_movie_hit = not is_season and video_type == VIDEO_TYPES.MOVIE
        is_season_hit = is_season and video_type == VIDEO_TYPES.SEASON
        if not (is_movie_hit or is_season_hit):
            continue

        match_year = ''
        if video_type == VIDEO_TYPES.MOVIE:
            title_year = re.search('(.*?)(?:\s+\(?(\d{4})\)?)', match_title_year)
            if title_year:
                match_title, match_year = title_year.groups()
            else:
                match_title = match_title_year
        else:
            match_title = match_title_year
            if season and int(is_season.group(1)) != int(season):
                continue

        if not year or not match_year or year == match_year:
            results.append({'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year,
                            'url': scraper_utils.pathify_url(url)})
    return results
def search(self, video_type, title, year, season=''):
    """Search the site root with ``?s=<title>`` and parse ``div.item`` entries.

    Titles shaped like ``NxM`` are skipped as episodes. The year comes from
    the title, falling back to ``span.year``. A trailing resolution such as
    ``720p`` is cut from the title and the ``calidad2`` label, if any, is
    appended in square brackets.

    Returns a list of dicts with 'title', 'year' and 'url' keys.
    """
    results = []
    html = self._http_get(self.base_url, params={'s': title}, cache_limit=8)

    for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
        link = re.search('href="([^"]+)', item)
        title_frag = dom_parser.parse_dom(item, 'span', {'class': 'tt'})
        year_frag = dom_parser.parse_dom(item, 'span', {'class': 'year'})
        if not (link and title_frag):
            continue

        url = link.group(1)
        match_title = title_frag[0]
        if re.search('\d+\s*x\s*\d+', match_title):
            continue  # exclude episodes

        match_title, match_year = scraper_utils.extra_year(match_title)
        if not match_year and year_frag:
            match_year = year_frag[0]

        before_resolution = re.search('(.*?)\s+\d{3,}p', match_title)
        if before_resolution:
            match_title = before_resolution.group(1)

        extra = dom_parser.parse_dom(item, 'span', {'class': 'calidad2'})
        if extra:
            match_title += ' [%s]' % (extra[0])

        if not year or not match_year or year == match_year:
            results.append({'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year,
                            'url': scraper_utils.pathify_url(url)})
    return results
def search(self, video_type, title, year, season=''):
    """Search ``/movie/search/<title>`` and parse ``div.ml-item`` entries.

    The title is reduced to ASCII letters, digits and spaces before being
    sent. Items with an ``mli-eps`` badge are seasons, the rest movies.
    Result URLs point at the item's ``watching.html`` page.

    Returns a list of dicts with 'title', 'year' and 'url' keys.
    """
    search_url = urlparse.urljoin(self.base_url, '/movie/search/')
    title = re.sub('[^A-Za-z0-9 ]', '', title)
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)
    results = []

    for item in dom_parser.parse_dom(html, 'div', {'class': 'ml-item'}):
        info = dom_parser.parse_dom(item, 'span', {'class': 'mli-info'})
        link = re.search('href="([^"]+)', item, re.DOTALL)
        year_match = re.search('class="jt-info">(\d{4})<', item)
        is_episodes = dom_parser.parse_dom(item, 'span', {'class': 'mli-eps'})

        wanted = (video_type == VIDEO_TYPES.MOVIE and not is_episodes) or (video_type == VIDEO_TYPES.SEASON and is_episodes)
        if not wanted or not (info and link):
            continue

        match_title = re.sub('</?h2>', '', info[0])
        match_title = re.sub('\s+\d{4}$', '', match_title)
        if video_type == VIDEO_TYPES.SEASON and season and not re.search('Season\s+%s$' % (season), match_title):
            continue

        url = urlparse.urljoin(link.group(1), 'watching.html')
        match_year = year_match.group(1) if year_match else ''
        if not year or not match_year or year == match_year:
            results.append({'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year,
                            'url': scraper_utils.pathify_url(url)})
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search ``/movies.php?list=search`` and parse ``coverPreview`` rows.

    The request is made with English-language cookies. Rows whose link
    text contains ``(TVshow)`` are shows, the rest movies. The year is the
    last ``div`` in the row whose content starts with four digits.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/movies.php')
    cookies = {'onlylanguage': 'en', 'lang': 'en'}
    params = {'list': 'search', 'search': title}
    html = self._http_get(search_url, params=params, cookies=cookies, cache_limit=8)

    for _attrs, content in dom_parser2.parse_dom(html, 'TR', {'id': re.compile('coverPreview\d+')}):
        links = dom_parser2.parse_dom(content, 'a', req='href')
        if not links:
            continue

        match_url = links[0].attrs['href']
        match_title = links[0].content
        is_show = re.search('\(tvshow\)', match_title, re.I)
        if (video_type == VIDEO_TYPES.MOVIE and is_show) or (video_type == VIDEO_TYPES.TVSHOW and not is_show):
            continue

        match_title = match_title.replace('(TVshow)', '').strip()

        match_year = ''
        for _div_attrs, div in dom_parser2.parse_dom(content, 'div'):
            year_match = re.match('\s*(\d{4})\s*', div)
            if year_match:
                match_year = year_match.group(1)

        if not year or not match_year or year == match_year:
            results.append({'url': scraper_utils.pathify_url(match_url),
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year})
    return results
def search(self, video_type, title, year, season=''):
    """Search ``/search-movies/<title>.html`` and parse ``div.thumb`` entries.

    An entry is a season when its link title ends in ``Season N``. Movie
    searches keep only non-season entries and read the year from the
    ``status-year`` block; season searches keep only season entries with
    the requested season number.

    Args:
        video_type: VIDEO_TYPES.MOVIE or VIDEO_TYPES.SEASON.
        title: Title to search for.
        year: Requested year, or a falsy value to accept any year.
        season: Requested season number, or a falsy value for any season.

    Returns:
        A list of dicts with 'url', 'title' and 'year' keys.
    """
    search_url = urlparse.urljoin(self.base_url, '/search-movies/%s.html')
    search_url = search_url % (urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=0)
    results = []

    for thumb in dom_parser.parse_dom(html, 'div', {'class': 'thumb'}):
        match_title = dom_parser.parse_dom(thumb, 'a', {'class': 'clip-link'}, ret='title')
        url = dom_parser.parse_dom(thumb, 'a', {'class': 'clip-link'}, ret='href')
        if not (match_title and url):
            continue
        match_title, url = match_title[0], url[0]
        is_season = re.search('Season\s+(\d+)$', match_title, re.I)

        # The original condition read `is_season and VIDEO_TYPES.SEASON`,
        # which is true for every season entry regardless of the requested
        # type, so season entries leaked into movie searches.
        is_movie_hit = not is_season and video_type == VIDEO_TYPES.MOVIE
        is_season_hit = is_season and video_type == VIDEO_TYPES.SEASON
        if not (is_movie_hit or is_season_hit):
            continue

        match_year = ''
        if video_type == VIDEO_TYPES.MOVIE:
            year_frag = dom_parser.parse_dom(thumb, 'div', {'class': '[^"]*status-year[^"]*'})
            # Keep '' when no year block exists; the original left the empty
            # list returned by parse_dom in the result's 'year' field.
            if year_frag:
                match_year = year_frag[0]
        else:
            if season and int(is_season.group(1)) != int(season):
                continue

        if not year or not match_year or year == match_year:
            results.append({'url': scraper_utils.pathify_url(url),
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year})
    return results
def search(self, video_type, title, year, season=''):
    """Search ``/search/<title>.html`` and parse the ``ul.cfv`` list.

    Items with a ``div.status`` block are seasons, the rest movies. Season
    titles must end in the requested season number; movie years come from a
    ``-YYYY`` segment of the URL. Normalized titles must contain one
    another in either direction.

    Returns a list of dicts with 'title', 'year' and 'url' keys.
    """
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/search/%s.html' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, cache_limit=1)
    fragment = dom_parser2.parse_dom(html, 'ul', {'class': 'cfv'})
    if not fragment:
        return results

    norm_title = scraper_utils.normalize_title(title)
    for _attrs, item in dom_parser2.parse_dom(fragment[0].content, 'li'):
        is_season = dom_parser2.parse_dom(item, 'div', {'class': 'status'})
        wanted = (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON)
        if not wanted:
            continue

        anchors = dom_parser2.parse_dom(item, 'a', req=['href', 'title'])
        if not anchors:
            continue
        match_title = anchors[0].attrs['title']
        match_url = anchors[0].attrs['href']

        match_year = ''
        if video_type == VIDEO_TYPES.SEASON:
            if season and not re.search('Season\s+%s$' % (season), match_title, re.I):
                continue
        else:
            url_year = re.search('-(\d{4})[-.]', match_url)
            if url_year:
                match_year = url_year.group(1)

        match_norm_title = scraper_utils.normalize_title(match_title)
        title_match = (norm_title in match_norm_title) or (match_norm_title in norm_title)
        if title_match and (not year or not match_year or year == match_year):
            results.append({'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year,
                            'url': scraper_utils.pathify_url(match_url)})
    return results
def search(self, video_type, title, year, season=''):
    """Search ``/search/<title>.html`` and parse the ``ul.cfv`` list.

    Items with a ``div.status`` block are seasons, the rest movies. Season
    titles must end in the requested season number; movie years come from a
    trailing ``-YYYY.html`` in the URL.

    Returns a list of dicts with 'title', 'year' and 'url' keys.
    """
    results = []
    search_url = urlparse.urljoin(self.base_url, '/search/%s.html' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, cache_limit=1)
    fragment = dom_parser.parse_dom(html, 'ul', {'class': 'cfv'})
    if not fragment:
        return results

    for item in dom_parser.parse_dom(fragment[0], 'li'):
        is_season = dom_parser.parse_dom(item, 'div', {'class': 'status'})
        wanted = (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON)
        if not wanted:
            continue

        hrefs = dom_parser.parse_dom(item, 'a', ret='href')
        titles = dom_parser.parse_dom(item, 'a', ret='title')
        if not (hrefs and titles):
            continue
        match_title, match_url = titles[0], hrefs[0]

        match_year = ''
        if video_type == VIDEO_TYPES.SEASON:
            if season and not re.search('Season\s+%s$' % (season), match_title, re.I):
                continue
        else:
            url_year = re.search('-(\d{4})\.html', match_url)
            if url_year:
                match_year = url_year.group(1)

        if not year or not match_year or year == match_year:
            results.append({'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year,
                            'url': scraper_utils.pathify_url(match_url)})
    return results
def search(self, video_type, title, year, season=''):
    """Search ``/index.php?menu=search`` and parse the movies or series section.

    The section is the ``div`` whose id is 'movies' (VIDEO_TYPES.MOVIE) or
    'series' (VIDEO_TYPES.TVSHOW). Each ``figcaption`` title is split into
    title and year, then a leading 'Watch ' and a trailing ' Online' are
    stripped from the title.

    Args:
        video_type: VIDEO_TYPES.MOVIE or VIDEO_TYPES.TVSHOW.
        title: Title to search for.
        year: Requested year, or a falsy value to accept any year.
        season: Unused.

    Returns:
        A list of dicts with 'title', 'url' and 'year' keys.

    Raises:
        KeyError: If ``video_type`` is neither MOVIE nor TVSHOW.
    """
    search_url = urlparse.urljoin(self.base_url, '/index.php?menu=search&query=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=.25)
    results = []

    sections = {VIDEO_TYPES.MOVIE: 'movies', VIDEO_TYPES.TVSHOW: 'series'}
    fragment = dom_parser.parse_dom(html, 'div', {'id': sections[video_type]})
    if not fragment:
        return results

    for item in dom_parser.parse_dom(fragment[0], 'figcaption'):
        caption = re.search('title="([^"]+)[^>]+href="([^"]+)', item)
        if not caption:
            continue
        match_title_year, url = caption.groups()
        title_year = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
        if title_year:
            match_title, match_year = title_year.groups()
        else:
            match_title, match_year = match_title_year, ''

        # Strip only the decoration at the edges. The original used
        # str.replace, which also removed 'Watch ' / ' Online' occurring
        # inside the real title.
        if match_title.startswith('Watch '):
            match_title = match_title[len('Watch '):]
        if match_title.endswith(' Online'):
            match_title = match_title[:-len(' Online')]

        if not year or not match_year or year == match_year:
            results.append({'title': scraper_utils.cleanse_title(match_title),
                            'url': scraper_utils.pathify_url(url),
                            'year': match_year})
    return results
def search(self, video_type, title, year, season=''):
    """Search ``/index.php?search_keywords=`` using the site's hidden key.

    The base page is fetched uncached to read the hidden ``key`` input,
    which is appended to the search URL. TV show and episode searches use
    section 2, everything else section 1. If no key is found, a warning is
    logged and no results are returned.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    search_url = urlparse.urljoin(self.base_url, '/index.php?search_keywords=')
    search_url += urllib.quote_plus(title)
    search_url += '&year=' + urllib.quote_plus(str(year))
    if video_type in [VIDEO_TYPES.TVSHOW, VIDEO_TYPES.EPISODE]:
        search_url += '&search_section=2'
    else:
        search_url += '&search_section=1'

    results = []
    html = self._http_get(self.base_url, cache_limit=0)
    key_match = re.search('input type="hidden" name="key" value="([0-9a-f]*)"', html)
    if not key_match:
        log_utils.log('Unable to locate PW search key', log_utils.LOGWARNING)
        return results

    search_url += '&key=' + key_match.group(1)
    html = self._http_get(search_url, cache_limit=.25)
    pattern = r'class="index_item.+?href="(.+?)" title="Watch (.+?)"?\(?([0-9]{4})?\)?"?>'
    for hit in re.finditer(pattern, html):
        match_url, match_title, match_year = hit.groups('')
        results.append({'url': scraper_utils.pathify_url(match_url),
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year})
    return results
def search(self, video_type, title, year, season=''):
    """Search ``/search.php`` (suggestion-style endpoint) and parse ``<li>`` rows.

    The current Unix time is sent as the ``timestamp`` query parameter.
    The first ``<li>`` is skipped. An entry is a season when its text ends
    in ``Season N`` or ``SN``. Results are not filtered by ``year``.

    Args:
        video_type: VIDEO_TYPES.MOVIE or VIDEO_TYPES.SEASON.
        title: Title to search for.
        year: Unused for filtering.
        season: Requested season number, or a falsy value for any season.

    Returns:
        A list of dicts with 'title', 'year' and 'url' keys.
    """
    # The original query string read 'limit=20×tamp=%s': the '&times' of
    # '&timestamp' had been decoded as an HTML entity, so the parameter was
    # never sent under its real name.
    search_url = urlparse.urljoin(self.base_url, '/search.php?q=%s&limit=20&timestamp=%s' % (urllib.quote_plus(title), int(time.time())))
    html = self._http_get(search_url, cache_limit=.25)
    results = []

    items = dom_parser.parse_dom(html, 'li')
    # Slicing skips the first row and yields nothing when fewer than two exist.
    for item in items[1:]:
        match_url = dom_parser.parse_dom(item, 'a', ret='href')
        match_title_year = dom_parser.parse_dom(item, 'strong')
        if not (match_url and match_title_year):
            continue
        match_url = match_url[0]
        match_title_year = re.sub('</?strong>', '', match_title_year[0])
        is_season = re.search('S(?:eason\s+)?(\d+)$', match_title_year, re.I)

        # The original condition read `is_season and VIDEO_TYPES.SEASON`,
        # which is true for every season entry regardless of the requested
        # type, so season entries leaked into movie searches.
        is_movie_hit = not is_season and video_type == VIDEO_TYPES.MOVIE
        is_season_hit = is_season and video_type == VIDEO_TYPES.SEASON
        if not (is_movie_hit or is_season_hit):
            continue

        if video_type == VIDEO_TYPES.MOVIE:
            title_year = re.search('(.*?)(?:\s+\(?(\d{4})\)?)', match_title_year)
            if title_year:
                match_title, match_year = title_year.groups()
            else:
                match_title, match_year = match_title_year, ''
        else:
            if season and int(is_season.group(1)) != int(season):
                continue
            match_title, match_year = match_title_year, ''

        results.append({'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year,
                        'url': scraper_utils.pathify_url(match_url)})
    return results
def search(self, video_type, title, year, season=''):
    """Search ``/search/<title>`` and parse ``div.inner`` entries.

    Movie searches skip links containing 'tv-series'. Markup and a leading
    'Watch Movie' are removed from the link text, which is then split into
    title and ``(YYYY...)``; when no year is found there, the link inside
    ``span.year`` is used.

    Returns a list of dicts with 'title', 'url' and 'year' keys.
    """
    results = []
    search_url = urlparse.urljoin(self.base_url, '/search/') + urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)

    for fragment in dom_parser.parse_dom(html, 'div', {'class': 'inner'}):
        name = dom_parser.parse_dom(fragment, 'div', {'class': 'name'})
        if not name:
            continue
        link = re.search('href="([^"]+)[^>]+>(.*?)</a>', name[0])
        if not link:
            continue

        match_url, match_title_year = link.groups()
        if 'tv-series' in match_url and video_type == VIDEO_TYPES.MOVIE:
            continue

        match_title_year = re.sub('</?[^>]*>', '', match_title_year)
        match_title_year = re.sub('[Ww]atch\s+[Mm]ovie\s*', '', match_title_year)
        match_title_year = match_title_year.replace('’', "'")

        title_year = re.search('(.*?)\s+\((\d{4})[^)]*\)$', match_title_year)
        if title_year:
            match_title, match_year = title_year.groups()
        else:
            match_title, match_year = match_title_year, ''

        if not match_year:
            year_span = dom_parser.parse_dom(fragment, 'span', {'class': 'year'})
            year_text = dom_parser.parse_dom(year_span[0], 'a') if year_span else None
            if year_text:
                match_year = year_text[0].strip()

        if not year or not match_year or year == match_year:
            results.append({'title': scraper_utils.cleanse_title(match_title),
                            'url': scraper_utils.pathify_url(match_url),
                            'year': match_year})
    return results
def search(self, video_type, title, year, season=''):
    """Search ``/search/?q=<title>`` and parse ``div.ml-item`` entries.

    Items with an ``mli-eps`` badge are seasons, the rest movies. Result
    URLs point at the item's ``watch/`` sub-path. Movie years are read from
    a trailing ``- YYYY`` in the image's alt text.

    Returns a list of dicts with 'title', 'year' and 'url' keys.
    """
    search_url = urlparse.urljoin(self.base_url, '/search/?q=') + urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=8)
    results = []

    for item in dom_parser.parse_dom(html, 'div', {'class': 'ml-item'}):
        info = dom_parser.parse_dom(item, 'span', {'class': 'mli-info'})
        link = re.search('href="([^"]+)', item, re.DOTALL)
        year_frag = dom_parser.parse_dom(item, 'img', ret='alt')
        is_episodes = dom_parser.parse_dom(item, 'span', {'class': 'mli-eps'})

        wanted = (video_type == VIDEO_TYPES.MOVIE and not is_episodes) or (video_type == VIDEO_TYPES.SEASON and is_episodes)
        if not wanted or not (info and link):
            continue

        match_url = link.group(1)
        match_title = re.sub('</?h2>', '', info[0])
        match_title = re.sub('\s+\d{4}$', '', match_title)
        if video_type == VIDEO_TYPES.SEASON and season and not re.search('Season\s+%s$' % (season), match_title):
            continue

        # urljoin only appends to the path when it ends with a slash.
        if not match_url.endswith('/'):
            match_url += '/'
        match_url = urlparse.urljoin(match_url, 'watch/')

        match_year = ''
        if video_type == VIDEO_TYPES.MOVIE and year_frag:
            alt_year = re.search('\s*-\s*(\d{4})$', year_frag[0])
            if alt_year:
                match_year = alt_year.group(1)

        if not year or not match_year or year == match_year:
            results.append({'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year,
                            'url': scraper_utils.pathify_url(match_url)})
    return results
def search(self, video_type, title, year, season=''):
    """Find titles by scanning the series index or POSTing a search.

    TV show searches read every ``a.underilne`` link on ``/series/all/``.
    Other searches POST to ``/search``; if the site answers with a "search
    again in N seconds" notice, the call sleeps for N seconds (capped at
    ``self.timeout``) and retries uncached. Candidates are kept when the
    normalized search title is contained in theirs. Year is always ''.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    results = []
    if video_type == VIDEO_TYPES.TVSHOW:
        url = urlparse.urljoin(self.base_url, '/series/all/')
        html = self._http_get(url, cache_limit=8)
        links = dom_parser.parse_dom(html, 'a', {'class': 'underilne'}, 'href')
        titles = dom_parser.parse_dom(html, 'a', {'class': 'underilne'})
        items = zip(links, titles)
    else:
        url = urlparse.urljoin(self.base_url, '/search?=%s' % urllib.quote_plus(title))
        data = {'q': title, 'go': 'Search'}
        html = self._http_get(url, data=data, cache_limit=8)
        throttled = re.search('you can search again in (\d+) seconds', html, re.I)
        if throttled:
            wait = int(throttled.group(1))
            if wait > self.timeout:
                wait = self.timeout
            time.sleep(wait)
            html = self._http_get(url, data=data, cache_limit=0)

        pattern = 'class="movie_box.*?href="([^"]+).*?<h1>([^<]+)'
        items = re.findall(pattern, html, re.DOTALL)

    norm_title = scraper_utils.normalize_title(title)
    for url, match_title in items:
        if norm_title in scraper_utils.normalize_title(match_title):
            results.append({'url': scraper_utils.pathify_url(url),
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': ''})
    return results
def search(self, video_type, title, year, season=''):
    """Search ``/movies.php?list=search`` and parse ``tdmovies`` cells.

    The request is made with English-language cookies. Entries whose text
    contains '(TVshow)' are shows, the rest movies. The year is the first
    ``>YYYY<`` found in the row's ``f7`` cell.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    search_url = urlparse.urljoin(self.base_url, '/movies.php?list=search&search=') + urllib.quote_plus(title)
    cookies = {'onlylanguage': 'en', 'lang': 'en'}
    html = self._http_get(search_url, cookies=cookies, cache_limit=.25)
    results = []

    pattern = 'id="tdmovies">\s*<a\s+href="([^"]+)">([^<]+).*?id="f7">(.*?)</TD>'
    for row in re.finditer(pattern, html, re.DOTALL):
        url, match_title, extra = row.groups('')
        is_show = '(TVshow)' in match_title
        if (video_type == VIDEO_TYPES.MOVIE and is_show) or (video_type == VIDEO_TYPES.TVSHOW and not is_show):
            continue

        match_title = match_title.replace('(TVshow)', '').strip()
        year_match = re.search('>(\d{4})<', extra)
        match_year = year_match.group(1) if year_match else ''

        if not year or not match_year or year == match_year:
            results.append({'url': scraper_utils.pathify_url(url),
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year})
    return results
def search(self, video_type, title, year, season=""):
    """Search '/search?query=' and return items of the requested video type.

    An item without a "movie-series" div counts as a movie; one labelled
    "TV SERIE" counts as a TV show.  The year is read from a '-YYYY-'
    segment of the item URL when present.

    Returns a list of {"url", "title", "year"} dicts.
    """
    search_url = urlparse.urljoin(self.base_url, "/search?query=%s") % (urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=8)

    results = []
    for block in dom_parser.parse_dom(html, "div", {"class": "one_movie-item"}):
        hrefs = dom_parser.parse_dom(block, "a", ret="href")
        alts = dom_parser.parse_dom(block, "img", ret="alt")
        label = dom_parser.parse_dom(block, "div", {"class": "movie-series"})
        if not label:
            item_type = VIDEO_TYPES.MOVIE
        elif label[0] == "TV SERIE":
            item_type = VIDEO_TYPES.TVSHOW
        else:
            # Unrecognised label: the raw parse result is compared as-is below.
            item_type = label

        if not (hrefs and alts and video_type == item_type):
            continue

        match_url = hrefs[0]
        match_title = alts[0]
        year_hit = re.search("-(\d{4})-", match_url)
        match_year = year_hit.group(1) if year_hit else ""
        if not year or not match_year or year == match_year:
            results.append({
                "url": scraper_utils.pathify_url(match_url),
                "title": scraper_utils.cleanse_title(match_title),
                "year": match_year,
            })
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search via the base64-encoded SEARCH_URL, falling back to the site search.

    Each JSON result is skipped when its 'articlePublishedTime' metatag is
    judged too old by ``self.__too_old``.  The title comes from the
    'ogTitle' metatag (or 'titleNoFormatting'), with a trailing
    "- Scene Down" suffix removed, and is kept only if
    ``scraper_utils.meta_release_check`` accepts it for the query.

    When nothing was collected, the result of ``self.__site_search`` is
    returned instead.  Returns a list of {'title', 'year', 'url'} dicts.
    """
    results = []
    search_url = base64.decodestring(SEARCH_URL) % (urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=2)
    if html:
        js_data = scraper_utils.parse_json(html)
        search_meta = scraper_utils.parse_episode_link(title)
        suffix_re = re.compile('\s*-\s*Scene\s*Down$', re.I)
        for entry in js_data.get('results', []):
            metatags = entry.get('richSnippet', {}).get('metatags', {})
            published = metatags.get('articlePublishedTime')
            if published:
                # Drop the trailing UTC offset before parsing the timestamp.
                published = re.sub('[+-]\d+:\d+$', '', published)
                published = scraper_utils.to_datetime(published, '%Y-%m-%dT%H:%M:%S').date()
                if self.__too_old(published):
                    continue

            match_title = metatags.get('ogTitle', '') or entry['titleNoFormatting']
            match_title = suffix_re.sub('', match_title)
            match_url = entry['url']
            match_year = ''
            item_meta = scraper_utils.parse_episode_link(match_title)
            if not scraper_utils.meta_release_check(video_type, search_meta, item_meta):
                continue
            results.append({'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year,
                            'url': scraper_utils.pathify_url(match_url)})

    if not results:
        results = self.__site_search(video_type, title, year)
    return results
def search(self, video_type, title, year, season=''):
    """POST the title to '/search_ajax' and return every listed link.

    No filtering by video type or year is applied; each result has an
    empty 'year'.  Returns a list of {'url', 'title', 'year'} dicts.
    """
    search_url = urlparse.urljoin(self.base_url, '/search_ajax')
    html = self._http_get(search_url, data={'query': title}, headers=XHR, cache_limit=1)
    link_re = 'class="list-group-item"\s+href="([^"]+)">([^<]+)'
    return [
        {
            'url': scraper_utils.pathify_url(href),
            'title': scraper_utils.cleanse_title(label),
            'year': ''
        }
        for href, label in re.findall(link_re, html)
    ]
def search(self, video_type, title, year, season=''):
    """Search '/search?q=' and return the link found in each media heading.

    No filtering by video type or year is applied; each result has an
    empty 'year'.  Returns a list of {'url', 'title', 'year'} dicts.
    """
    search_url = urlparse.urljoin(self.base_url, '/search?q=') + urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=8)

    results = []
    for heading in dom_parser.parse_dom(html, 'h4', {'class': 'media-heading'}):
        link = re.search('href="([^"]+)">([^<]+)', heading)
        if link is None:
            continue
        href, label = link.groups()
        results.append({
            'url': scraper_utils.pathify_url(href),
            'title': scraper_utils.cleanse_title(label),
            'year': ''
        })
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search the site (``?s=`` query) and return year-compatible entries.

    The year is taken from the link text via ``scraper_utils.extra_year``
    and, failing that, from the item's 'browse-movie-year' div.  Every
    returned URL has '?watching' appended.

    Returns a list of {'title', 'year', 'url'} dicts.
    """
    html = self._http_get(self.base_url, params={'s': title}, cache_limit=8)
    results = []
    for _attrs, block in dom_parser2.parse_dom(html, 'div', {'class': 'browse-movie-top'}):
        anchors = dom_parser2.parse_dom(block, 'a', req='href')
        if not anchors:
            continue

        first = anchors[0]
        match_url = first.attrs['href']
        match_title, match_year = scraper_utils.extra_year(first.content)
        if not match_year:
            year_div = dom_parser2.parse_dom(block, 'div', {'class': 'browse-movie-year'})
            if year_div:
                match_year = year_div[0].content.strip()

        match_url += '?watching'
        if year and match_year and year != match_year:
            continue
        results.append({'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year,
                        'url': scraper_utils.pathify_url(match_url)})
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search '/search' (``q`` parameter) and return the first link of each cell.

    No filtering by video type or year is applied; each result has an
    empty 'year'.  Returns a list of {'url', 'title', 'year'} dicts.
    """
    search_url = scraper_utils.urljoin(self.base_url, '/search')
    html = self._http_get(search_url, params={'q': title}, cache_limit=8)

    results = []
    for _attrs, cell in dom_parser2.parse_dom(html, 'td', {'class': 'col-md-10'}):
        anchors = dom_parser2.parse_dom(cell, 'a', req='href')
        if not anchors:
            continue
        anchor = anchors[0]
        results.append({
            'url': scraper_utils.pathify_url(anchor.attrs['href']),
            'title': scraper_utils.cleanse_title(anchor.content),
            'year': ''
        })
    return results
def search(self, video_type, title, year, season=''):
    """Search '/?s=' and return posts whose year is compatible with ``year``.

    The post text is split into title and four-digit year when it ends in a
    year (optionally parenthesised); otherwise the whole text is the title
    and the year is empty.

    Returns a list of {'url', 'title', 'year'} dicts.
    """
    search_url = urlparse.urljoin(self.base_url, '/?s=%s' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, cache_limit=.25)

    results = []
    post_re = 'class="home_post_cont.*?href="([^"]+).*?/">(.*?)<'
    for post in re.finditer(post_re, html, re.DOTALL):
        href, title_year = post.groups()
        parts = re.search('(.*?)(?:\s+\(?(\d{4})\)?)', title_year)
        if parts:
            match_title, match_year = parts.groups()
        else:
            match_title, match_year = title_year, ''

        if year and match_year and year != match_year:
            continue
        results.append({'url': scraper_utils.pathify_url(href),
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year})
    return results
def search(self, video_type, title, year, season=''):
    """Search '/search/<title>' and return year-compatible entries.

    Links containing 'tv-series' are skipped for movie searches.  Markup
    and a leading "Watch Movie" are stripped from the link text.  The year
    is read from a trailing "(YYYY...)" group, or else from the
    fragment's 'year' span.

    Returns a list of {'title', 'url', 'year'} dicts.
    """
    search_url = urlparse.urljoin(self.base_url, '/search/') + urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)

    results = []
    for block in dom_parser.parse_dom(html, 'div', {'class': 'inner'}):
        name_divs = dom_parser.parse_dom(block, 'div', {'class': 'name'})
        if not name_divs:
            continue
        link = re.search('href="([^"]+)[^>]+>(.*?)</a>', name_divs[0])
        if not link:
            continue

        match_url, title_year = link.groups()
        if 'tv-series' in match_url and video_type == VIDEO_TYPES.MOVIE:
            continue

        title_year = re.sub('</?[^>]*>', '', title_year)
        title_year = re.sub('[Ww]atch\s+[Mm]ovie\s*', '', title_year)
        title_year = title_year.replace('’', "'")
        parts = re.search('(.*?)\s+\((\d{4})[^)]*\)$', title_year)
        if parts:
            match_title, match_year = parts.groups()
        else:
            match_title, match_year = title_year, ''

        if not match_year:
            # Fall back to the year shown in the fragment's "year" span.
            year_span = dom_parser.parse_dom(block, 'span', {'class': 'year'})
            if year_span:
                year_text = dom_parser.parse_dom(year_span[0], 'a')
                if year_text:
                    match_year = year_text[0].strip()

        if not year or not match_year or year == match_year:
            results.append({
                'title': scraper_utils.cleanse_title(match_title),
                'url': scraper_utils.pathify_url(match_url),
                'year': match_year
            })
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Walk the XML catalogue and return items matching title and year.

    Starts at '/addons/real-movies/base.xml'.  Every non-empty <folder>
    value found in a fetched file is appended to the work list and fetched
    in turn.  An <item> matches when the normalized query is contained in
    its normalized title and the years are compatible.  Result URLs have
    the form 'xml_file=<name>&link=<link>', where <name> is the lowercased,
    space-free basename of the file the item came from.

    Returns a list of {'title', 'year', 'url'} dicts.
    """
    wanted = scraper_utils.normalize_title(title)
    results = []
    pending = ['/addons/real-movies/base.xml']
    # 'pending' grows while being iterated; the loop also visits the
    # folders appended along the way.
    for folder_path in pending:
        xml_file = os.path.basename(folder_path).replace(' ', '').lower()
        page_url = scraper_utils.urljoin(self.base_url, folder_path)
        xml = self._http_get(page_url, require_debrid=False, cache_limit=48)
        pending.extend(sub for sub in re.findall('<folder>(.*?)</folder>', xml, re.I) if sub)

        for entry in re.finditer('<item>(.*?)</item>', xml, re.I | re.DOTALL):
            body = entry.group(1)
            title_hit = re.search('<title>(.*?)</title>', body, re.I)
            link_hit = re.search('<link>(.*?)</link>', body, re.I)
            if not (title_hit and link_hit):
                continue
            title_year = title_hit.group(1)
            link = link_hit.group(1)
            if not (title_year and link):
                continue

            match_title, match_year = scraper_utils.extra_year(title_year)
            match_url = 'xml_file=%s&link=%s' % (xml_file, link)
            if wanted not in scraper_utils.normalize_title(match_title):
                continue
            if year and match_year and year != match_year:
                continue

            if 'format' in XML_META.get(xml_file, {}):
                match_title += ' (%s)' % (XML_META[xml_file]['format'])
            results.append({
                'title': scraper_utils.cleanse_title(match_title),
                'year': match_year,
                'url': match_url
            })
    return results
def search(self, video_type, title, year, season=''):
    """Scan the forum index for section titles containing the query.

    Matching is done on normalized titles; ``year`` is not consulted and
    each result has an empty 'year'.

    Returns a list of {'url', 'title', 'year'} dicts.
    """
    forum_url = urlparse.urljoin(self.base_url, '/forum/forum.php')
    html = self._http_get(forum_url, cache_limit=48)
    wanted = scraper_utils.normalize_title(title)

    results = []
    for section in dom_parser.parse_dom(html, 'span', {'class': 'sectiontitle'}):
        link = re.search('href="([^"]+)[^>]+>([^<]+)', section)
        if not link:
            continue
        href, label = link.groups()
        if wanted in scraper_utils.normalize_title(label):
            results.append({
                'url': scraper_utils.pathify_url(href),
                'title': scraper_utils.cleanse_title(label),
                'year': ''
            })
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Scan the links inside the home page's 'fil' div for the query.

    Matching is done on normalized titles; ``year`` is not consulted and
    each result has an empty 'year'.  An empty list is returned when the
    div is missing.

    Returns a list of {'url', 'title', 'year'} dicts.
    """
    results = []
    html = self._http_get(self.base_url, cache_limit=48)
    container = dom_parser2.parse_dom(html, 'div', {'id': 'fil'})
    if container:
        wanted = scraper_utils.normalize_title(title)
        for href, label in re.findall('href="([^"]+)"\s+title="([^"]+)', container[0].content):
            if wanted not in scraper_utils.normalize_title(label):
                continue
            results.append({
                'url': scraper_utils.pathify_url(href),
                'title': scraper_utils.cleanse_title(label),
                'year': ''
            })
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Scan the '/tv-lists/' page for shows whose title contains the query.

    Only the first occurrence of each (pathified) URL is considered.
    <strong> tags are removed from the link text before matching on
    normalized titles.  ``year`` is not consulted.

    Returns a list of {'url', 'title', 'year'} dicts.
    """
    listing_url = scraper_utils.urljoin(self.base_url, '/tv-lists/')
    html = self._http_get(listing_url, cache_limit=8)
    wanted = scraper_utils.normalize_title(title)

    results = []
    visited = set()
    for _attrs, entry in dom_parser2.parse_dom(html, 'li'):
        anchors = dom_parser2.parse_dom(entry, 'a', req='href')
        if not anchors:
            continue
        match_url = scraper_utils.pathify_url(anchors[0].attrs['href'])
        if match_url in visited:
            continue
        visited.add(match_url)

        match_title = re.sub('</?strong[^>]*>', '', anchors[0].content)
        if wanted in scraper_utils.normalize_title(match_title):
            results.append({'url': match_url,
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': ''})
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search '/search' and return items of the requested video type.

    An item containing a 'movieTV' div is treated as a TV show.  The site
    result carries no year here, so 'year' is always empty and the year
    comparison never rejects an item.

    Returns a list of {'url', 'title', 'year'} dicts.
    """
    search_url = scraper_utils.urljoin(self.base_url, '/search')
    html = self._http_get(search_url, params={'type': 'movies', 'q': title}, cache_limit=8)

    results = []
    for _attrs, block in dom_parser2.parse_dom(html, 'div', {'id': re.compile('movie-\d+')}):
        tv_marker = dom_parser2.parse_dom(block, 'div', {'class': 'movieTV'})
        if video_type == VIDEO_TYPES.MOVIE and tv_marker:
            continue
        if video_type == VIDEO_TYPES.TVSHOW and not tv_marker:
            continue

        anchors = dom_parser2.parse_dom(block, 'a', req='href')
        headings = dom_parser2.parse_dom(block, 'h4')
        if not (anchors and headings):
            continue

        match_title = headings[0].content
        match_url = anchors[0].attrs['href']
        match_year = ''
        if not year or not match_year or year == match_year:
            results.append({'url': scraper_utils.pathify_url(match_url),
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year})
    return results
def search(self, video_type, title, year, season=''):
    """Search '/search/<title>' and parse the "list-film" result list.

    Each entry's link title has the form "Watch <title> (<year>)"; entries
    are kept when the year is compatible with ``year``.

    Returns a list of {'url', 'title', 'year'} dicts.
    """
    search_url = urlparse.urljoin(self.base_url, '/search/') + urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=0)

    results = []
    listing = re.search('ul class="list-film"(.*?)</ul>', html, re.DOTALL)
    if not listing:
        return results

    entry_re = 'class="name">\s*<a\s+href="([^"]+)"\s+title="Watch\s+(.*?)\s+\((\d{4})\)'
    for entry in re.finditer(entry_re, listing.group(1), re.DOTALL):
        href, match_title, match_year = entry.groups('')
        if year and match_year and year != match_year:
            continue
        results.append({
            'url': scraper_utils.pathify_url(href),
            'title': scraper_utils.cleanse_title(match_title),
            'year': match_year
        })
    return results
def __list(self, title):
    """List the series entries filed under the first character of ``title``.

    Requests the 'series-list' charmap page for '/' + title[0] and parses
    every link inside the 'downpara-list' div.  Entries that
    ``self.__get_title_parts`` flags as packs are skipped.

    Returns a list of dicts with keys 'url', 'title', 'year', 'quality',
    'season' and 'q_str'; an empty list when the container div is missing.
    """
    results = []
    list_url = scraper_utils.urljoin(self.base_url, 'index.php')
    query = {'do': 'charmap', 'name': 'series-list', 'args': '/' + title[0]}
    html = self._http_get(list_url, params=query, require_debrid=True, cache_limit=48)

    container = dom_parser2.parse_dom(html, 'div', {'class': 'downpara-list'})
    if container:
        for anchor in dom_parser2.parse_dom(container[0].content, 'a', req='href'):
            href = anchor.attrs['href']
            match_title, match_season, q_str, is_pack = self.__get_title_parts(anchor.content)
            if is_pack:
                continue
            # Unknown quality strings default to HIGH.
            quality = QUALITY_MAP.get(q_str, QUALITIES.HIGH)
            results.append({'url': scraper_utils.pathify_url(href),
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': '',
                            'quality': quality,
                            'season': match_season,
                            'q_str': q_str})
    return results
def search(self, video_type, title, year, season=''):
    """Search '/search/<title>.html' and return movies or seasons.

    A listing whose title ends in " - S<n>" is a season entry.  Movie
    searches keep only non-season entries; season searches keep only season
    entries (optionally restricted to ``season``).  For movies the year is
    taken from the first four-digit number in an 'info' paragraph, or else
    from a trailing four-digit number in the URL.

    Fix: the season half of the type check used to read
    ``is_season and VIDEO_TYPES.SEASON``, which tests the constant rather
    than ``video_type`` and therefore let season entries into movie
    searches.  It now compares ``video_type == VIDEO_TYPES.SEASON``.

    Returns a list of {'url', 'title', 'year'} dicts.
    """
    search_url = urlparse.urljoin(self.base_url, '/search/%s.html' % urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    fragment = dom_parser.parse_dom(html, 'div', {'class': 'list-movie'})
    if not fragment:
        return results

    for item in dom_parser.parse_dom(fragment[0], 'div', {'class': 'movie'}):
        match = re.search(r'class="movie-name".*?href="([^"]+)[^>]+>([^<]+)', item)
        if not match:
            continue

        url, match_title = match.groups()
        is_season = re.search(r'\s+-\s+[Ss](\d+)$', match_title)
        wanted = ((not is_season and video_type == VIDEO_TYPES.MOVIE)
                  or (is_season and video_type == VIDEO_TYPES.SEASON))
        if not wanted:
            continue

        match_year = ''
        if video_type == VIDEO_TYPES.MOVIE:
            for info_frag in dom_parser.parse_dom(item, 'p', {'class': 'info'}):
                match = re.search(r'(\d{4})', info_frag)
                if match:
                    match_year = match.group(1)
                    break
            if not match_year:
                # Fall back to a year at the very end of the URL.
                match = re.search(r'(\d{4})$', url)
                if match:
                    match_year = match.group(1)
        elif season and int(is_season.group(1)) != int(season):
            # Season search for a specific season: skip other seasons.
            continue

        if not year or not match_year or year == match_year:
            results.append({
                'url': scraper_utils.pathify_url(url),
                'title': scraper_utils.cleanse_title(match_title),
                'year': match_year
            })
    return results
def search(self, video_type, title, year, season=''):
    """Scan the home page's 'dizis' container for links matching the query.

    Matching is done on normalized titles; ``year`` is not consulted and
    each result has an empty 'year'.

    Returns a list of {'url', 'title', 'year'} dicts.
    """
    html = self._http_get(self.base_url, cache_limit=8)
    container = dom_parser.parse_dom(html, 'div', {'class': '[^"]*dizis[^"]*'})
    wanted = scraper_utils.normalize_title(title)

    results = []
    if not container:
        return results
    for href, label in re.findall('href="([^"]+)[^>]*>([^<]+)', container[0]):
        if wanted in scraper_utils.normalize_title(label):
            results.append({
                'url': scraper_utils.pathify_url(href),
                'title': scraper_utils.cleanse_title(label),
                'year': ''
            })
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """POST the query to '/ajax/search.php' and keep items of the wanted kind.

    TV-show and episode searches keep items whose 'meta' field starts with
    'TV SHOW' (case-insensitively); every other search keeps those starting
    with 'MOVIE'.  ``year`` is not consulted.

    Returns a list of {'title', 'url', 'year'} dicts.
    """
    search_url = scraper_utils.urljoin(self.base_url, '/ajax/search.php')
    payload = {'q': title, 'limit': 100, 'timestamp': int(time.time() * 1000), 'verifiedCheck': ''}
    html = self._http_get(search_url, data=payload, headers=XHR, cache_limit=1)

    wants_show = video_type in [VIDEO_TYPES.TVSHOW, VIDEO_TYPES.EPISODE]
    media_type = 'TV SHOW' if wants_show else 'MOVIE'

    results = []
    for entry in scraper_utils.parse_json(html, search_url):
        if entry['meta'].upper().startswith(media_type):
            results.append({'title': scraper_utils.cleanse_title(entry['title']),
                            'url': scraper_utils.pathify_url(entry['permalink']),
                            'year': ''})
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search '/search/<title>' and return movies or shows matching ``year``.

    Only 'list' divs that contain a 'lists_titles' div are considered.
    A title carrying an open-ended year such as "(2010-)" is treated as a
    TV show; movie searches skip those and TV-show searches skip the rest.

    Fix: the show detector was the malformed pattern '\\(d{4|-\\)', which
    parses as the alternation "(d{4" | "-)" and only matched by accident on
    the trailing "-)".  It is now the intended '\\(\\d{4}-\\)'.

    Returns a list of {'url', 'title', 'year'} dicts.
    """
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/search/')
    search_url = scraper_utils.urljoin(search_url, urllib.quote_plus(title))
    html = self._http_get(search_url, require_debrid=False, cache_limit=8)
    show_re = re.compile(r'\(\d{4}-\)')
    for _attrs, fragment in dom_parser2.parse_dom(html, 'div', {'class': 'list'}):
        if not dom_parser2.parse_dom(fragment, 'div', {'class': 'lists_titles'}):
            continue

        for attrs, match_title_year in dom_parser2.parse_dom(fragment, 'a', {'class': 'title'}, req='href'):
            match_url = attrs['href']
            # Strip any markup nested inside the link text.
            match_title_year = re.sub(r'</?[^>]*>', '', match_title_year)
            is_show = show_re.search(match_title_year)
            if (is_show and video_type == VIDEO_TYPES.MOVIE) or (not is_show and video_type == VIDEO_TYPES.TVSHOW):
                continue

            match_title, match_year = scraper_utils.extra_year(match_title_year)
            if not year or not match_year or year == match_year:
                results.append({'url': scraper_utils.pathify_url(match_url),
                                'title': scraper_utils.cleanse_title(match_title),
                                'year': match_year})
    return results
def search(self, video_type, title, year, season=''):
    """Scan home-page links carrying a 'rollapp-href' attribute for the query.

    For each link whose normalized text contains the normalized query, the
    show directory is resolved through ``self.__get_show_dir``; links for
    which that returns nothing are dropped.  ``year`` is not consulted.

    Returns a list of {'url', 'title', 'year'} dicts.
    """
    wanted = scraper_utils.normalize_title(title)
    html = self._http_get(self.base_url, cache_limit=48)
    hrefs = dom_parser.parse_dom(html, 'a', {'rollapp-href': '[^"]*'}, ret='href')
    labels = dom_parser.parse_dom(html, 'a', {'rollapp-href': '[^"]*'})

    results = []
    for href, label in zip(hrefs, labels):
        if wanted not in scraper_utils.normalize_title(label):
            continue
        show_dir = self.__get_show_dir(href)
        if not show_dir:
            continue
        results.append({
            'url': scraper_utils.pathify_url(show_dir),
            'title': scraper_utils.cleanse_title(label),
            'year': ''
        })
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search '/search-movies/<title>.html' and return movies or seasons.

    Title and details are read from each item's 'onmouseover' tooltip.  A
    tooltip mentioning "season <n>" marks a season entry: movie searches
    skip those and season searches skip everything else.  For movies a
    missing year falls back to the tooltip's "Release:" field; for other
    searches a given ``season`` must equal the tooltip's season number.

    Returns a list of {'title', 'year', 'url'} dicts.
    """
    search_url = scraper_utils.urljoin(
        self.base_url, '/search-movies/%s.html' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, cache_limit=8)

    results = []
    for _attrs, entry in dom_parser2.parse_dom(html, 'li', {'class': 'item'}):
        anchors = dom_parser2.parse_dom(entry, 'a', req='href')
        tooltip_hit = re.search('onmouseover="([^"]+)', entry)
        if not (anchors and tooltip_hit):
            continue

        match_url = anchors[0].attrs['href']
        tooltip = tooltip_hit.group(1)
        heading = re.search('<b>(?:<i>)?\s*(.*?)\s*(?:</i>)?</b>', tooltip)
        if not heading:
            continue

        match_title, match_year = scraper_utils.extra_year(heading.group(1))
        is_season = re.search('season\s+(\d+)', tooltip, re.I)
        if is_season and video_type == VIDEO_TYPES.MOVIE:
            continue
        if not is_season and video_type == VIDEO_TYPES.SEASON:
            continue

        if video_type == VIDEO_TYPES.MOVIE:
            if not match_year:
                release = re.search('>Release:\s*(\d{4})', tooltip)
                match_year = release.group(1) if release else ''
        elif season and int(season) != int(is_season.group(1)):
            continue

        if not year or not match_year or year == match_year:
            results.append({
                'title': scraper_utils.cleanse_title(match_title),
                'year': match_year,
                'url': scraper_utils.pathify_url(match_url)
            })
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search '/search/<title>' and return year-compatible entries.

    Links containing 'tv-series' are skipped for movie searches.  Markup
    and a leading "Watch Movie" are stripped from the link text before
    ``scraper_utils.extra_year`` splits it; a missing year falls back to
    the fragment's 'year' span.

    Returns a list of {'title', 'url', 'year'} dicts.
    """
    search_url = scraper_utils.urljoin(self.base_url, '/search/') + urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)

    results = []
    for _attrs, block in dom_parser2.parse_dom(html, 'div', {'class': 'inner'}):
        name_divs = dom_parser2.parse_dom(block, 'div', {'class': 'name'})
        if name_divs:
            anchors = dom_parser2.parse_dom(name_divs[0].content, 'a', req='href')
        else:
            anchors = None
        if not anchors:
            continue

        match_url = anchors[0].attrs['href']
        title_year = anchors[0].content
        if 'tv-series' in match_url and video_type == VIDEO_TYPES.MOVIE:
            continue

        title_year = re.sub('</?[^>]*>', '', title_year)
        title_year = re.sub('[Ww]atch\s+[Mm]ovie\s*', '', title_year)
        title_year = title_year.replace('’', "'")
        match_title, match_year = scraper_utils.extra_year(title_year)

        if not match_year:
            # Fall back to the year shown in the fragment's "year" span.
            year_span = dom_parser2.parse_dom(block, 'span', {'class': 'year'})
            if year_span:
                year_link = dom_parser2.parse_dom(year_span[0].content, 'a')
                if year_link:
                    match_year = year_link[0].content.strip()

        if year and match_year and year != match_year:
            continue
        results.append({
            'title': scraper_utils.cleanse_title(match_title),
            'url': scraper_utils.pathify_url(match_url),
            'year': match_year
        })
    return results
def search(self, video_type, title, year, season=''):
    """Guess the page URL from the title and report it if the page loads.

    The slug is built by dropping apostrophes, replacing runs of other
    non-alphanumeric, non-whitespace characters with a space, lowercasing,
    trimming, collapsing whitespace and joining words with '-'.  A given
    ``year`` is appended as '-<year>'.

    Returns a one-element list of {'title', 'year', 'url'} when the guessed
    URL yields content, otherwise an empty list.
    """
    slug = title.replace("'", '')
    slug = re.sub(r'[^a-zA-Z0-9\s]+', ' ', slug).lower().strip()
    slug = re.sub('\s+', ' ', slug).replace(' ', '-')
    if year:
        slug += '-%s' % (year)
    test_url = urlparse.urljoin(self.base_url, slug + '/')

    if not self._http_get(test_url, cache_limit=1):
        return []
    return [{
        'title': scraper_utils.cleanse_title(title),
        'year': year,
        'url': scraper_utils.pathify_url(test_url)
    }]
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search the site (``?s=`` query) and return year-compatible covers.

    Entries whose link title contains an SxxEyy marker are skipped.  A year
    missing from the title falls back to a '-YYYY-' segment of the URL.

    Returns a list of {'title', 'year', 'url'} dicts.
    """
    html = self._http_get(self.base_url, params={'s': title}, cache_limit=8)
    results = []
    for _attrs, cover in dom_parser2.parse_dom(html, 'div', {'class': 'cover'}):
        anchors = dom_parser2.parse_dom(cover, 'a', req=['href', 'title'])
        if not anchors:
            continue

        link_attrs = anchors[0].attrs
        match_url, title_year = link_attrs['href'], link_attrs['title']
        if re.search('S\d+E\d+', title_year, re.I):
            continue

        match_title, match_year = scraper_utils.extra_year(title_year)
        if not match_year:
            url_year = re.search('-(\d{4})-', match_url)
            if url_year:
                match_year = url_year.group(1)

        if year and match_year and year != match_year:
            continue
        results.append({'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year,
                        'url': scraper_utils.pathify_url(match_url)})
    return results
def __movie_search(self, title, year):
    """Scan the base directory listing for movie files matching the query.

    Directory entries are ignored, as are files whose parsed 'extra' part
    mentions "dubbed".  A file matches when the normalized query is
    contained in its normalized title and the years are compatible.  The
    returned title has dots replaced by spaces and ' [<height>p.<extra>]'
    appended.

    Returns a list of {'url', 'title', 'year'} dicts.
    """
    wanted = scraper_utils.normalize_title(title)
    html = self._http_get(self.base_url, cache_limit=48)

    results = []
    for entry in self._parse_directory(html):
        if entry['directory']:
            continue
        match_title, match_year, height, extra = scraper_utils.parse_movie_link(entry['title'])
        if 'dubbed' in extra.lower():
            continue
        if wanted not in scraper_utils.normalize_title(match_title):
            continue
        if year and match_year and year != match_year:
            continue

        display = match_title.replace('.', ' ') + ' [%sp.%s]' % (height, extra)
        results.append({
            'url': scraper_utils.pathify_url(entry['link']),
            'title': scraper_utils.cleanse_title(display),
            'year': match_year
        })
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search the site (``?s=`` query) and return year-compatible items.

    Items whose title looks like "<n> x <n>" are episodes and are skipped.
    A year missing from the title falls back to the item's 'year' span.

    Returns a list of {'title', 'year', 'url'} dicts.
    """
    html = self._http_get(self.base_url, params={'s': title}, cache_limit=1)
    results = []
    for _attrs, block in dom_parser2.parse_dom(html, 'div', {'class': 'item'}):
        anchors = dom_parser2.parse_dom(block, 'a', req='href')
        title_spans = dom_parser2.parse_dom(block, 'span', {'class': 'tt'})
        year_spans = dom_parser2.parse_dom(block, 'span', {'class': 'year'})
        if not (anchors and title_spans):
            continue

        match_url = anchors[0].attrs['href']
        raw_title = title_spans[0].content
        if re.search('\d+\s*x\s*\d+', raw_title):
            continue  # exclude episodes

        match_title, match_year = scraper_utils.extra_year(raw_title)
        if year_spans and not match_year:
            match_year = year_spans[0].content

        if year and match_year and year != match_year:
            continue
        results.append({'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year,
                        'url': scraper_utils.pathify_url(match_url)})
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search '/tvseries/search.php' and return matching series.

    HTML comments are removed before parsing.  The title comes from the
    image 'alt' text of each 'topic_content' cell; links not starting with
    '/' are prefixed with '/tvseries/'.  A cell matches when the normalized
    query is contained in its normalized title and the years are
    compatible.

    Returns a list of {'url', 'title', 'year'} dicts.
    """
    page_url = scraper_utils.urljoin(self.base_url, '/tvseries/search.php')
    html = self._http_get(page_url, params={'dayq': title}, cache_limit=48)
    html = re.sub('<!--.*?-->', '', html)
    wanted = scraper_utils.normalize_title(title)

    results = []
    for _attrs, cell in dom_parser2.parse_dom(html, 'td', {'class': 'topic_content'}):
        anchors = dom_parser2.parse_dom(cell, 'a', req='href')
        images = dom_parser2.parse_dom(cell, 'img', req='alt')
        if not (anchors and images):
            continue

        match_url = anchors[0].attrs['href']
        title_year = images[0].attrs['alt']
        if not match_url.startswith('/'):
            match_url = '/tvseries/' + match_url

        match_title, match_year = scraper_utils.extra_year(title_year)
        if wanted not in scraper_utils.normalize_title(match_title):
            continue
        if year and match_year and year != match_year:
            continue
        results.append({'url': scraper_utils.pathify_url(match_url),
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year})
    return results
def _get_episode_url(self, show_url, video):
    """Look for the episode link across the show's paginated listing.

    Starting at ``show_url``, each page is handed to
    ``self._default_get_episode_url`` with a season/episode href pattern.
    If that finds nothing, the link labelled ">" in the 'pagination' div is
    followed.  At most MAX_PAGES pages are visited.

    Returns the episode URL, or None when no page yields one.
    """
    episode_pattern = 'href="([^"]+season-0*%s-episode-0*%s-[^"]*)' % (
        video.season, video.episode)

    next_url = show_url
    visited = 0
    while next_url and visited < MAX_PAGES:
        next_url = scraper_utils.urljoin(self.base_url, next_url)
        html = self._http_get(next_url, cache_limit=2)
        found = self._default_get_episode_url(html, video, episode_pattern)
        if found:
            return found

        pager = dom_parser2.parse_dom(html, 'div', {'class': 'pagination'})
        if not pager:
            return None
        next_link = re.search('href="([^"]+)[^>]+>\s*>\s*<', pager[0].content)
        if not next_link:
            return None

        next_url = scraper_utils.cleanse_title(next_link.group(1))
        visited += 1
    return None
def __search(self, video_type, title, year, season=''):
    """Search '/search/<title>' and return year-compatible entries.

    The link text of each 'name_top' div is split into title and
    four-digit year when a year (optionally parenthesised) follows the
    title; otherwise the whole text is the title and the year is empty.

    Returns a list of {'title', 'year', 'url'} dicts.
    """
    search_url = urlparse.urljoin(self.base_url, '/search/') + urllib.quote(title)
    html = self._http_get(search_url, cache_limit=2)

    results = []
    for block in dom_parser.parse_dom(html, 'div', {'class': 'name_top'}):
        link = re.search('href="([^"]+)[^>]+>([^<]+)', block)
        if not link:
            continue

        match_url, title_year = link.groups()
        parts = re.search('(.*?)(?:\s+\(?(\d{4})\)?)', title_year)
        if parts:
            match_title, match_year = parts.groups()
        else:
            match_title, match_year = title_year, ''

        if year and match_year and year != match_year:
            continue
        results.append({'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year,
                        'url': scraper_utils.pathify_url(match_url)})
    return results
def search(self, video_type, title, year, season=''):
    """Search the 'FilmsAndTV' category and return entries of the wanted type.

    URL, name and copyright year are read from each result's ``itemprop``
    markup.  An entry is kept when the years are compatible and its
    lowercased URL contains ``FRAGMENTS[video_type]``.

    Returns a list of {'url', 'title', 'year'} dicts.
    """
    path = '/Category-FilmsAndTV/Genre-Any/Letter-Any/ByPopularity/1/Search-%s.htm' % (title)
    search_url = urlparse.urljoin(self.base_url, path)
    html = self._http_get(search_url, cache_limit=.25)

    results = []
    for block in dom_parser.parse_dom(html, 'div', {'class': 'searchResult'}):
        hrefs = dom_parser.parse_dom(block, 'a', {'itemprop': 'url'}, ret='href')
        names = dom_parser.parse_dom(block, 'span', {'itemprop': 'name'})
        years = dom_parser.parse_dom(block, 'span', {'itemprop': 'copyrightYear'})
        match_year = years[0] if years else ''

        if not (hrefs and names):
            continue
        if year and match_year and year != match_year:
            continue
        if FRAGMENTS[video_type] not in hrefs[0].lower():
            continue
        results.append({'url': scraper_utils.pathify_url(hrefs[0]),
                        'title': scraper_utils.cleanse_title(names[0]),
                        'year': match_year})
    return results
def search(self, video_type, title, year, season=''):
    """Query '/search.php' and return movies or seasons.

    When the response has two or more <li> entries the first is skipped.
    An entry whose text ends in "S<n>" or "Season <n>" is a season entry.
    Movie searches keep only non-season entries and split a year off the
    title; season searches keep only season entries, optionally restricted
    to ``season``.  Results are not filtered by ``year``.

    Fixes:
    * The query string had been garbled to 'limit=20×tamp=' (a decoded
      '&times' entity); it is restored to '&limit=20&timestamp='.
    * The season half of the type check read
      ``is_season and VIDEO_TYPES.SEASON``, which tests the constant rather
      than ``video_type`` and let season entries into movie searches.  It
      now compares ``video_type == VIDEO_TYPES.SEASON``.

    Returns a list of {'title', 'year', 'url'} dicts.
    """
    search_url = urlparse.urljoin(
        self.base_url,
        '/search.php?q=%s&limit=20&timestamp=%s' % (urllib.quote_plus(title), int(time.time())))
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    items = dom_parser.parse_dom(html, 'li')
    if len(items) >= 2:
        items = items[1:]

    for item in items:
        match_url = dom_parser.parse_dom(item, 'a', ret='href')
        match_title_year = dom_parser.parse_dom(item, 'strong')
        if not (match_url and match_title_year):
            continue

        match_url = match_url[0]
        match_title_year = re.sub(r'</?strong>', '', match_title_year[0])
        is_season = re.search(r'S(?:eason\s+)?(\d+)$', match_title_year, re.I)

        if video_type == VIDEO_TYPES.MOVIE and not is_season:
            match = re.search(r'(.*?)(?:\s+\(?(\d{4})\)?)', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                match_year = ''
        elif video_type == VIDEO_TYPES.SEASON and is_season:
            if season and int(is_season.group(1)) != int(season):
                continue
            match_title = match_title_year
            match_year = ''
        else:
            # Entry kind does not match the requested video type.
            continue

        results.append({
            'title': scraper_utils.cleanse_title(match_title),
            'year': match_year,
            'url': scraper_utils.pathify_url(match_url)
        })
    return results
def search(self, video_type, title, year, season=''):
    """Search '/movies-list.php' and return a detail URL per movie entry.

    The slug between 'movie-detail/' and the next '/' of each entry's link
    is substituted into DETAIL_URL.  ``year`` is not consulted and each
    result has an empty 'year'.

    Returns a list of {'url', 'title', 'year'} dicts.
    """
    search_url = urlparse.urljoin(self.base_url, '/movies-list.php?b=search&v=%s')
    search_url = search_url % (urllib.quote_plus(title))
    html = self._http_get(search_url, headers=XHR, cache_limit=0)

    results = []
    for entry in dom_parser.parse_dom(html, 'li', {'class': '[^"]*movie[^"]*'}):
        hrefs = dom_parser.parse_dom(entry, 'a', ret='href')
        headings = dom_parser.parse_dom(entry, 'h4')
        if not (hrefs and headings):
            continue
        slug = re.search('movie-detail/(.*?)/', hrefs[0])
        if slug:
            results.append({
                'url': DETAIL_URL % (slug.group(1)),
                'title': scraper_utils.cleanse_title(headings[0]),
                'year': ''
            })
    return results