def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    """Find the show page for *title*, then the requested season/episode,
    and return its sources.

    The show is matched by cleaned title plus *show_year* in the link
    title; the episode is additionally validated by *year* (+/- 1)
    appearing in its air-date span. Returns [] on any failure.
    """
    try:
        query = urlparse.urljoin(
            self.base_link,
            self.tvsearch_link % urllib.quote_plus(
                title.replace('\'', '').rsplit(':', 1)[0]))
        html = proxy.get(query, 'item')
        # A link to page 2 (plain or URL-encoded) means more results exist.
        if 'page=2' in html or 'page%3D2' in html:
            html += proxy.get(query + '&page=2', 'item')
        html = BeautifulSoup(html)
        cleaned_title = 'watchputlocker' + clean_title(title)
        years = ['%s' % str(year),
                 '%s' % str(int(year) + 1),
                 '%s' % str(int(year) - 1)]
        show_url = None
        for item in html.findAll('div', attrs={'class': 'item'}):
            for link in item.findAll('a'):
                href = link['href']
                link_title = link['title']
                # Proxy links wrap the target in a ?u= or ?q= parameter.
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                except:
                    pass
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                except:
                    pass
                if cleaned_title == clean_title(link_title) and show_year in link_title:
                    url = re.findall('(?://.+?|)(/.+)', href)[0]
                    show_url = urlparse.urljoin(self.base_link, replaceHTMLCodes(url))
        # Fix: bail out explicitly when no show matched, instead of letting
        # proxy.get(None, ...) raise into the blanket except below.
        if show_url is None:
            return []
        html = BeautifulSoup(proxy.get(show_url, 'tv_episode_item'))
        for season_item in html.findAll('div', attrs={'class': 'show_season'}):
            if season_item['data-id'] != season:
                continue
            for episode_item in season_item.findAll('div', attrs={'class': 'tv_episode_item'}):
                link = episode_item.findAll('a')[-1]
                if link.contents[0].strip() != 'E%s' % episode:
                    continue
                # The last tv_num_versions span holds the air date text.
                link_airdate = link.findAll('span', attrs={'class': 'tv_num_versions'})[-1]
                link_airdate = link_airdate.contents[0]
                if any(candidate_year in link_airdate for candidate_year in years):
                    return self.sources(link['href'])
    except:
        pass
    return []
def get_html(self, title, search_link):
    """Fetch the search-results HTML for *title*.

    Follows a second results page when the first links to one and
    concatenates both. Returns the raw HTML, or None when the response
    lacks the 'index_item' marker.
    """
    key = self.get_key()
    # Drop apostrophes and any trailing ':'-suffix before URL-encoding.
    encoded_title = urllib.quote_plus(title.replace('\'', '').rsplit(':', 1)[0])
    query = urlparse.urljoin(self.base_link, search_link % (encoded_title, key))
    html = proxy.get(query, ('index_item'))
    if 'index_item' in html:
        # A page-2 link (plain or URL-encoded) means more results exist.
        if 'page=2' in html or 'page%3D2' in html:
            html += proxy.get(query + '&page=2', 'index_item')
        return html
def get_html(self, title, search_link):
    """Fetch search-results HTML for *title*, stripping punctuation first.

    Removes '\\'"?:!@#$&-,' characters and collapses whitespace before
    URL-encoding. Follows a second results page when present. Returns the
    HTML string, or None when the 'index_item' marker is absent.
    """
    key = self.get_key()
    # translate(None, ...) deletes the listed punctuation (Python 2 str).
    stripped = " ".join(title.translate(None, '\'"?:!@#$&-,').split())
    query = urlparse.urljoin(
        self.base_link,
        search_link % (urllib.quote_plus(stripped.rsplit(':', 1)[0]), key))
    html = proxy.get(query, ('index_item'))
    if 'index_item' in html:
        # A page-2 link (plain or URL-encoded) means more results exist.
        if 'page=2' in html or 'page%3D2' in html:
            html += proxy.get(query + '&page=2', 'index_item')
        return html
def get_html(self, title, search_link):
    """Return search-results HTML for *title* (pages 1+2 concatenated).

    Yields None implicitly when the response does not contain the
    'index_item' marker.
    """
    key = self.get_key()
    search_term = title.replace('\'', '').rsplit(':', 1)[0]
    query = urlparse.urljoin(self.base_link,
                             search_link % (urllib.quote_plus(search_term), key))
    html = proxy.get(query, ('index_item'))
    if 'index_item' not in html:
        return
    # Append the second page when the first one links to it.
    if 'page=2' in html or 'page%3D2' in html:
        second_page = proxy.get(query + '&page=2', 'index_item')
        html += second_page
    return html
def scrape_movie(self, title, year, imdb):
    """Search for a movie and return its sources, or [] when nothing matches.

    A result is accepted when its link title contains the release year
    (+/- one year, in parentheses) and its cleaned title matches exactly.
    """
    try:
        search = self.moviesearch_link % urllib.quote_plus(
            title.replace('\'', '').rsplit(':', 1)[0])
        query = urlparse.urljoin(self.base_link, search)
        html = proxy.get(query, 'item')
        # Pull in the second results page when the first links to it.
        if 'page=2' in html or 'page%3D2' in html:
            html += proxy.get(query + '&page=2', 'item')
        page = BeautifulSoup(html)
        wanted = 'watchputlocker' + clean_title(title)
        year_tags = ['(%s)' % str(year),
                     '(%s)' % str(int(year) + 1),
                     '(%s)' % str(int(year) - 1)]
        for result in page.findAll('div', attrs={'class': 'item'}):
            for anchor in result.findAll('a'):
                href = anchor['href']
                link_title = anchor['title']
                if not any(tag in link_title for tag in year_tags):
                    continue
                # Proxy links wrap the target in a ?u= or ?q= parameter.
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                except:
                    pass
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                except:
                    pass
                if wanted == clean_title(link_title):
                    url = re.findall('(?://.+?|)(/.+)', href)[0]
                    return self.sources(replaceHTMLCodes(url))
    except:
        pass
    return []
def sources(self, url):
    """Resolve a content page URL into a list of source dicts.

    Each entry has the shape {'source': host, 'quality': ..., 'scraper':
    'Primewire', 'url': link, 'direct': False}. Returns whatever was
    collected before a failure ([] at worst).
    """
    sources = []
    try:
        if url is None:  # fix: identity test for None, not ==
            return sources
        url = urlparse.urljoin(self.base_link, url)
        html = proxy.get(url, 'choose_tabs')
        parsed_html = BeautifulSoup(html)
        count = 0
        for table_body in parsed_html.findAll('tbody'):
            try:
                link = table_body.findAll('a')[0]['href']
                # Unwrap ?u= / ?q= proxy indirection layers when present.
                try:
                    link = urlparse.parse_qs(urlparse.urlparse(link).query)['u'][0]
                except:
                    pass
                try:
                    link = urlparse.parse_qs(urlparse.urlparse(link).query)['q'][0]
                except:
                    pass
                # The real target is base64-encoded in the ?url= parameter;
                # a missing parameter raises here and skips this row.
                link = urlparse.parse_qs(urlparse.urlparse(link).query)['url'][0]
                link = base64.b64decode(link)
                if link.startswith('//'):
                    link = 'http:' + link  # scheme-relative -> absolute
                link = replaceHTMLCodes(link)
                link = link.encode('utf-8')
                host = re.findall('([\w]+[.][\w]+)$',
                                  urlparse.urlparse(link.strip().lower()).netloc)[0]
                host = replaceHTMLCodes(host)
                host = host.encode('utf-8')
                # NOTE(review): with BeautifulSoup 4 the "class" attribute is
                # a list, so these string comparisons would never match; this
                # appears to assume BeautifulSoup 3 — confirm before porting.
                quality = table_body.findAll('span')[0]['class']
                if quality == 'quality_cam' or quality == 'quality_ts':
                    quality = 'CAM'
                elif quality == 'quality_dvd':
                    quality = 'SD'
                if not filter_host(host):
                    continue
                count += 1
                sources.append({'source': host, 'quality': quality,
                                'scraper': 'Primewire', 'url': link,
                                'direct': False})
            except:
                pass
        if dev_log == 'true':
            end_time = time.time() - self.start_time
            send_log(self.name, end_time, count)
        return sources
    except:
        return sources
def sources(self, url):
    """Scrape the link table on a content page and return source dicts.

    Each source: {'source': host, 'quality': ..., 'scraper': self.name,
    'url': href, 'direct': False}. Returns whatever was collected before
    any failure ([] at worst).
    """
    sources = []
    try:
        if url is None:  # fix: identity test for None, not ==
            return sources
        absolute_url = urlparse.urljoin(self.base_link, url)
        html = BeautifulSoup(proxy.get(absolute_url, 'link_ite'))
        tables = html.findAll('table', attrs={'class': re.compile('link_ite.+?')})
        for table in tables:
            for row in table.findAll('tr'):
                link = row.findAll('a')[-1]
                href = link['href']
                # Only rows whose link carries the encoded 'gtfo' payload
                # are real sources.
                if not 'gtfo' in href:
                    continue
                # Unwrap ?u= / ?q= proxy indirection when present.
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                except:
                    pass
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                except:
                    pass
                # The target URL is base64-encoded in the 'gtfo' parameter.
                href = base64.b64decode(
                    urlparse.parse_qs(urlparse.urlparse(href).query)['gtfo'][0])
                href = replaceHTMLCodes(href)
                host = re.findall('([\w]+[.][\w]+)$',
                                  urlparse.urlparse(href.strip().lower()).netloc)[0]
                host = replaceHTMLCodes(host)
                host = host.encode('utf-8')
                quality = row.findAll('div', attrs={'class': 'quality'})[0].text
                # Fix: classify with exclusive branches. The original set
                # quality = 'CAM' and then re-tested the NEW value for 'HD',
                # so every CAM/TS entry was overwritten with 'SD'.
                if 'CAM' in quality or 'TS' in quality:
                    quality = 'CAM'
                elif 'HD' not in quality:
                    quality = 'SD'
                sources.append({'source': host, 'quality': quality,
                                'scraper': self.name, 'url': href,
                                'direct': False})
    except:
        pass
    return sources
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb, debrid = False):
    """Search the site for *title*, open the matching show page, and return
    sources for the requested season/episode; [] when nothing matches or on
    error.

    show_year, year, imdb, tvdb and debrid are accepted for interface
    compatibility but are not used for matching here.
    """
    try:
        html = BeautifulSoup(self.get_html(title, self.tvsearch_link))
        index_items = html.findAll('div', attrs={'class': re.compile('index_item.+?')})
        # NOTE(review): " ".join(<str>) space-separates individual characters
        # (there is no .split() here, unlike get_html). Both sides of the
        # title comparison below are built the same way, so matching still
        # works as long as clean_title() discards whitespace — confirm.
        title = 'watch' + clean_title(" ".join(title.translate(None, '\'"?:!@#$&-,')))
        for index_item in index_items:
            try:
                links = index_item.findAll('a')
                for link in links:
                    href = link['href']
                    link_title = link['title']
                    # Unwrap ?u= / ?q= proxy indirection when present.
                    try:
                        href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                    except:
                        pass
                    try:
                        href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                    except:
                        pass
                    clean_link_title = clean_title(" ".join(link_title.encode().translate(None, '\'"?:!@#$&-,')))
                    if title == clean_link_title:
                        # href is the show page relative url
                        show_url = urlparse.urljoin(self.base_link, href)
                        html = BeautifulSoup(proxy.get(show_url, 'tv_episode_item'))
                        seasons = html.findAll('div', attrs={'class': 'show_season'})
                        for scraped_season in seasons:
                            if scraped_season['data-id'] == season:
                                tv_episode_items = scraped_season.findAll('div', attrs={'class': 'tv_episode_item'})
                                for tv_episode_item in tv_episode_items:
                                    links = tv_episode_item.findAll('a')
                                    for link in links:
                                        # Episode links are labelled 'E<number>'.
                                        if link.contents[0].strip() == "E%s" % episode:
                                            episode_href = link['href']
                                            try:
                                                episode_href = \
                                                    urlparse.parse_qs(urlparse.urlparse(episode_href).query)['u'][0]
                                            except:
                                                pass
                                            try:
                                                episode_href = \
                                                    urlparse.parse_qs(urlparse.urlparse(episode_href).query)['q'][0]
                                            except:
                                                pass
                                            return self.sources(episode_href)
            except:
                continue
        return []
    except Exception, argument:
        # Python 2 except syntax; log a search failure when dev logging is on.
        if dev_log == 'true':
            error_log(self.name, 'Check Search')
        return []
def scrape_movie(self, title, year, imdb):
    """Search the site for a movie and return its sources ([] on failure).

    A candidate link must carry the release year (+/- 1, in parentheses)
    in its title AND have a cleaned title equal to the wanted one.
    """
    try:
        encoded = urllib.quote_plus(title.replace('\'', '').rsplit(':', 1)[0])
        query = urlparse.urljoin(self.base_link, self.moviesearch_link % encoded)
        html = proxy.get(query, 'item')
        # Results may span two pages; fetch and append the second one.
        if 'page=2' in html or 'page%3D2' in html:
            second = proxy.get(query + '&page=2', 'item')
            html += second
        soup = BeautifulSoup(html)
        target_title = 'watchputlocker' + clean_title(title)
        acceptable_years = ['(%s)' % str(year),
                            '(%s)' % str(int(year) + 1),
                            '(%s)' % str(int(year) - 1)]
        for entry in soup.findAll('div', attrs={'class': 'item'}):
            for candidate in entry.findAll('a'):
                href = candidate['href']
                link_title = candidate['title']
                year_ok = any(y in link_title for y in acceptable_years)
                if not year_ok:
                    continue
                # Peel off the ?u= / ?q= proxy wrappers if present.
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                except:
                    pass
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                except:
                    pass
                if target_title == clean_title(link_title):
                    url = re.findall('(?://.+?|)(/.+)', href)[0]
                    url = replaceHTMLCodes(url)
                    return self.sources(url)
    except:
        pass
    return []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    """Locate the show page matching *title*, drill into the requested
    season/episode, and hand its link to self.sources().

    Returns [] on any failure or when nothing matches. show_year, year,
    imdb and tvdb are accepted for interface compatibility.
    """
    def unwrap(link):
        # Proxy URLs carry the real target in a ?u= then ?q= parameter.
        for param in ('u', 'q'):
            try:
                link = urlparse.parse_qs(urlparse.urlparse(link).query)[param][0]
            except:
                pass
        return link

    try:
        page = BeautifulSoup(self.get_html(title, self.tvsearch_link))
        results = page.findAll('div', attrs={'class': re.compile('index_item.+?')})
        wanted = 'watch' + clean_title(title).replace(": ", "")
        for result in results:
            try:
                for anchor in result.findAll('a'):
                    href = unwrap(anchor['href'])
                    if wanted != clean_title(anchor['title']):
                        continue
                    # href is the show page relative url
                    show_page = BeautifulSoup(
                        proxy.get(urlparse.urljoin(self.base_link, href),
                                  'tv_episode_item'))
                    for season_div in show_page.findAll('div', attrs={'class': 'show_season'}):
                        if season_div['data-id'] != season:
                            continue
                        for ep_div in season_div.findAll('div', attrs={'class': 'tv_episode_item'}):
                            for ep_link in ep_div.findAll('a'):
                                # Episode links are labelled 'E<number>'.
                                if ep_link.contents[0].strip() == "E%s" % episode:
                                    return self.sources(unwrap(ep_link['href']))
            except:
                continue
    except:
        pass
    return []
def sources(self, url):
    """Scrape the link table on a content page and return source dicts.

    Each source: {'source': host, 'quality': ..., 'scraper': self.name,
    'url': href, 'direct': False}. Hosts containing 'qertewrt' are
    skipped. Returns whatever was collected before any failure.
    """
    sources = []
    try:
        if url is None:  # fix: identity test for None, not ==
            return sources
        absolute_url = urlparse.urljoin(self.base_link, url)
        html = BeautifulSoup(proxy.get(absolute_url, 'link_ite'))
        tables = html.findAll('table', attrs={'class': re.compile('link_ite.+?')})
        for table in tables:
            for row in table.findAll('tr'):
                link = row.findAll('a')[-1]
                href = link['href']
                # Only rows whose link carries the encoded 'gtfo' payload
                # are real sources.
                if not 'gtfo' in href:
                    continue
                # Unwrap ?u= / ?q= proxy indirection when present.
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                except:
                    pass
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                except:
                    pass
                # The target URL is base64-encoded in the 'gtfo' parameter.
                href = base64.b64decode(
                    urlparse.parse_qs(urlparse.urlparse(href).query)['gtfo'][0])
                href = replaceHTMLCodes(href)
                host = re.findall('([\w]+[.][\w]+)$',
                                  urlparse.urlparse(href.strip().lower()).netloc)[0]
                host = replaceHTMLCodes(host)
                host = host.encode('utf-8')
                if "qertewrt" in host:
                    continue
                quality = row.findAll('div', attrs={'class': 'quality'})[0].text
                # Fix: classify with exclusive branches. The original set
                # quality = 'CAM' and then re-tested the NEW value for 'HD',
                # so every CAM/TS entry was overwritten with 'SD'.
                if 'CAM' in quality or 'TS' in quality:
                    quality = 'CAM'
                elif 'HD' not in quality:
                    quality = 'SD'
                sources.append({'source': host, 'quality': quality,
                                'scraper': self.name, 'url': href,
                                'direct': False})
    except:
        pass
    return sources
def get_key(self):
    """Return the value of the hidden 'key' input on the site's search form."""
    form_html = proxy.get(self.search_link, 'searchform')
    soup = BeautifulSoup(form_html)
    return soup.findAll('input', attrs={'name': 'key'})[0]['value']
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    """Search for the show, open its page, and return sources for the
    requested season/episode; [] when no match is found or on error.

    show_year, year, imdb and tvdb are accepted for interface
    compatibility but are not used in the matching below.
    """
    try:
        html = BeautifulSoup(self.get_html(title, self.tvsearch_link))
        index_items = html.findAll(
            'div', attrs={'class': re.compile('index_item.+?')})
        # Build the comparison key the same way the site titles appear:
        # 'watch' prefix, cleaned, with ': ' removed.
        title = 'watch' + clean_title(title).replace(": ", "")
        for index_item in index_items:
            try:
                links = index_item.findAll('a')
                for link in links:
                    href = link['href']
                    link_title = link['title']
                    # Unwrap ?u= / ?q= proxy indirection when present.
                    try:
                        href = urlparse.parse_qs(
                            urlparse.urlparse(href).query)['u'][0]
                    except:
                        pass
                    try:
                        href = urlparse.parse_qs(
                            urlparse.urlparse(href).query)['q'][0]
                    except:
                        pass
                    if title == clean_title(link_title):
                        # href is the show page relative url
                        show_url = urlparse.urljoin(self.base_link, href)
                        html = BeautifulSoup(
                            proxy.get(show_url, 'tv_episode_item'))
                        seasons = html.findAll(
                            'div', attrs={'class': 'show_season'})
                        for scraped_season in seasons:
                            if scraped_season['data-id'] == season:
                                tv_episode_items = scraped_season.findAll(
                                    'div', attrs={'class': 'tv_episode_item'})
                                for tv_episode_item in tv_episode_items:
                                    links = tv_episode_item.findAll('a')
                                    for link in links:
                                        # Episode links are labelled 'E<number>'.
                                        if link.contents[0].strip() == "E%s" % episode:
                                            episode_href = link['href']
                                            try:
                                                episode_href = \
                                                    urlparse.parse_qs(urlparse.urlparse(episode_href).query)['u'][0]
                                            except:
                                                pass
                                            try:
                                                episode_href = \
                                                    urlparse.parse_qs(urlparse.urlparse(episode_href).query)['q'][0]
                                            except:
                                                pass
                                            return self.sources(episode_href)
            except:
                continue
    except:
        pass
    return []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    """Find the show page for *title*, then the requested season/episode,
    and return its sources.

    Matching: cleaned title equality plus *show_year* in the link title;
    the episode is validated by *year* (+/- 1) appearing in its air-date
    span. Returns [] on any failure.
    """
    try:
        query = urlparse.urljoin(
            self.base_link,
            self.tvsearch_link % urllib.quote_plus(
                title.replace('\'', '').rsplit(':', 1)[0]))
        html = proxy.get(query, 'item')
        # A page-2 link (plain or URL-encoded) means more results exist.
        if 'page=2' in html or 'page%3D2' in html:
            html += proxy.get(query + '&page=2', 'item')
        html = BeautifulSoup(html)
        cleaned_title = 'watchputlocker' + clean_title(title)
        years = ['%s' % str(year),
                 '%s' % str(int(year) + 1),
                 '%s' % str(int(year) - 1)]
        show_url = None
        for item in html.findAll('div', attrs={'class': 'item'}):
            for link in item.findAll('a'):
                href = link['href']
                link_title = link['title']
                # Proxy links wrap the target in a ?u= or ?q= parameter.
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                except:
                    pass
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                except:
                    pass
                if cleaned_title == clean_title(link_title) and show_year in link_title:
                    url = re.findall('(?://.+?|)(/.+)', href)[0]
                    show_url = urlparse.urljoin(self.base_link, replaceHTMLCodes(url))
        # Fix: return explicitly when no show matched, instead of letting
        # proxy.get(None, ...) raise into the blanket except below.
        if show_url is None:
            return []
        html = BeautifulSoup(proxy.get(show_url, 'tv_episode_item'))
        for season_item in html.findAll('div', attrs={'class': 'show_season'}):
            if season_item['data-id'] != season:
                continue
            for episode_item in season_item.findAll('div', attrs={'class': 'tv_episode_item'}):
                link = episode_item.findAll('a')[-1]
                if link.contents[0].strip() != 'E%s' % episode:
                    continue
                # The last tv_num_versions span holds the air date text.
                link_airdate = link.findAll('span', attrs={'class': 'tv_num_versions'})[-1]
                link_airdate = link_airdate.contents[0]
                if any(candidate_year in link_airdate for candidate_year in years):
                    return self.sources(link['href'])
    except:
        pass
    return []