def sources(self, url):
    sources = []
    try:
        if url is None:
            return sources
        absolute_url = urlparse.urljoin(self.base_link, url)
        html = BeautifulSoup(proxy.get(absolute_url, 'link_ite'))
        # each hoster link sits in a table whose class starts with "link_ite"
        tables = html.findAll('table', attrs={'class': re.compile('link_ite.+?')})
        for table in tables:
            rows = table.findAll('tr')
            for row in rows:
                link = row.findAll('a')[-1]
                href = link['href']
                if 'gtfo' not in href:
                    continue
                # unwrap optional ?u=/?q= redirect wrappers, then base64-decode
                # the real target carried in the ?gtfo= parameter
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                except:
                    pass
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                except:
                    pass
                href = base64.b64decode(urlparse.parse_qs(urlparse.urlparse(href).query)['gtfo'][0])
                href = replaceHTMLCodes(href)
                host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(href.strip().lower()).netloc)[0]
                host = replaceHTMLCodes(host)
                host = host.encode('utf-8')
                quality = row.findAll('div', attrs={'class': 'quality'})[0].text
                if 'CAM' in quality or 'TS' in quality:
                    quality = 'CAM'
                elif 'HD' not in quality:
                    quality = 'SD'
                sources.append({'source': host, 'quality': quality, 'scraper': self.name,
                                'url': href, 'direct': False})
    except:
        pass
    return sources
def sources(self, url):
    sources = []
    try:
        if url is None:
            return sources
        url = urlparse.urljoin(self.base_link, url)
        html = proxy.get(url, 'choose_tabs')
        parsed_html = BeautifulSoup(html)
        table_bodies = parsed_html.findAll('tbody')
        for table_body in table_bodies:
            link = table_body.findAll('a')[0]["href"]
            try:
                link = urlparse.parse_qs(urlparse.urlparse(link).query)['u'][0]  # replace link with ?u= part if present
            except:
                pass
            try:
                link = urlparse.parse_qs(urlparse.urlparse(link).query)['q'][0]  # replace link with ?q= part if present
            except:
                pass
            link = urlparse.parse_qs(urlparse.urlparse(link).query)['url'][0]  # replace link with ?url= part if present
            link = base64.b64decode(link)  # decode base64
            if link.startswith("//"):
                link = "http:" + link
            link = replaceHTMLCodes(link)
            link = link.encode('utf-8')
            host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(link.strip().lower()).netloc)[0]
            host = replaceHTMLCodes(host)
            host = host.encode('utf-8')
            quality = table_body.findAll('span')[0]["class"]
            if quality == 'quality_cam' or quality == 'quality_ts':
                quality = 'CAM'
            elif quality == 'quality_dvd':
                quality = 'SD'
            sources.append({'source': host, 'quality': quality, 'scraper': 'Primewire',
                            'url': link, 'direct': False})
        return sources
    except:
        return sources
def sources(self, url):
    sources = []
    try:
        if url is None:
            return sources
        url = urlparse.urljoin(self.base_link, url)
        html = proxy.get(url, 'choose_tabs')
        parsed_html = BeautifulSoup(html)
        table_bodies = parsed_html.findAll('tbody')
        for table_body in table_bodies:
            link = table_body.findAll('a')[0]["href"]
            try:
                link = urlparse.parse_qs(urlparse.urlparse(link).query)['u'][0]  # replace link with ?u= part if present
            except:
                pass
            try:
                link = urlparse.parse_qs(urlparse.urlparse(link).query)['q'][0]  # replace link with ?q= part if present
            except:
                pass
            link = urlparse.parse_qs(urlparse.urlparse(link).query)['url'][0]  # replace link with ?url= part if present
            link = base64.b64decode(link)  # decode base64
            if link.startswith("//"):
                link = "http:" + link
            link = replaceHTMLCodes(link)
            link = link.encode('utf-8')
            host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(link.strip().lower()).netloc)[0]
            host = replaceHTMLCodes(host)
            host = host.encode('utf-8')
            quality = table_body.findAll('span')[0]["class"]
            if quality == 'quality_cam' or quality == 'quality_ts':
                quality = 'CAM'
            elif quality == 'quality_dvd':
                quality = 'SD'
            if "qertewrt" in host:
                continue
            sources.append({'source': host, 'quality': quality, 'scraper': 'Primewire',
                            'url': link, 'direct': False})
        return sources
    except:
        return sources
def sources(self, url):
    sources = []
    try:
        if url is None:
            return sources
        absolute_url = urlparse.urljoin(self.base_link, url)
        html = BeautifulSoup(proxy.get(absolute_url, 'link_ite'))
        tables = html.findAll('table', attrs={'class': re.compile('link_ite.+?')})
        for table in tables:
            rows = table.findAll('tr')
            for row in rows:
                link = row.findAll('a')[-1]
                href = link['href']
                if 'gtfo' not in href:
                    continue
                # unwrap optional ?u=/?q= redirect wrappers, then base64-decode
                # the real target carried in the ?gtfo= parameter
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                except:
                    pass
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                except:
                    pass
                href = base64.b64decode(urlparse.parse_qs(urlparse.urlparse(href).query)['gtfo'][0])
                href = replaceHTMLCodes(href)
                host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(href.strip().lower()).netloc)[0]
                host = replaceHTMLCodes(host)
                host = host.encode('utf-8')
                if "qertewrt" in host:
                    continue
                quality = row.findAll('div', attrs={'class': 'quality'})[0].text
                if 'CAM' in quality or 'TS' in quality:
                    quality = 'CAM'
                elif 'HD' not in quality:
                    quality = 'SD'
                sources.append({'source': host, 'quality': quality, 'scraper': self.name,
                                'url': href, 'direct': False})
    except:
        pass
    return sources
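# The sources() variants above all follow the same link-unwrapping recipe: peel
# off optional ?u=/?q= redirect wrappers, base64-decode the real target from a
# query parameter (?gtfo= or ?url=), then reduce the hoster URL to a bare
# "domain.tld" name. A minimal standalone sketch of that recipe follows; the
# helper name and the payload_param default are illustrative assumptions, not
# part of the scrapers themselves.
import re
import base64
import urlparse  # Python 2 stdlib, as used throughout these scrapers


def _unwrap_and_host(href, payload_param='url'):
    # peel off ?u= / ?q= redirect wrappers when present
    for wrapper in ('u', 'q'):
        try:
            href = urlparse.parse_qs(urlparse.urlparse(href).query)[wrapper][0]
        except:
            pass
    # the real target is base64-encoded in e.g. ?url= or ?gtfo=
    try:
        href = base64.b64decode(urlparse.parse_qs(urlparse.urlparse(href).query)[payload_param][0])
    except:
        pass
    # reduce the hoster URL to its bare "domain.tld" form
    host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(href.strip().lower()).netloc)
    return href, (host[0] if host else '')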
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = "%s+season+%s" % (urllib.quote_plus(title), season)
        query = self.search_link % query
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        checkseason = cleaned_title + "season" + season
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'ml-item'})
        for result in containers:
            links = result.findAll('a')
            for link in links:
                link_title = str(link['title'])
                href = str(link['href'])
                if clean_title(link_title) == checkseason:
                    # append the requested episode number so sources() can pick it out
                    ep_id = '?episode=%01d' % int(episode)
                    href = href + ep_id
                    return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title.replace("'", " ")))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'ml-item'})
        for result in containers:
            links = result.findAll('a')
            for link in links:
                link_title = str(link['title'])
                href = str(link['href'])
                info = str(link['data-url'])
                if clean_title(link_title) == cleaned_title:
                    # confirm the release year on the detail page before accepting the match
                    html = requests.get(info, headers=headers).content
                    pattern = '<div class="jt-info">%s</div>' % year
                    match = re.findall(pattern, html)
                    if match:
                        return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title) + "+" + str(year))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'inner'})
        for container in containers:
            movie_link = container.findAll('a')[0]
            r_href = movie_link['href']
            r_title = movie_link['title']
            link_year = container.findAll('span', attrs={'class': 'year'})[0].findAll('a')[0].text
            if str(year) == link_year:
                if cleaned_title in clean_title(r_title):
                    # follow the result page and pull the "watch" button target
                    redirect = requests.get(r_href, headers=headers, timeout=30).text
                    r_url = re.findall('<a href="(.*?)" class="btn-watch"', redirect)[0]
                    r_url = r_url.encode('utf-8')
                    return self.sources(replaceHTMLCodes(r_url))
    except:
        pass
    return []
def process_results_page(self, html, title, artist, referer):
    sources = []
    result = html.find("div", "result")
    for item in result.findAll("div", "item"):
        title_block = item.find("div", "title")
        link = title_block.find("a")
        link_href = link["href"]
        spans = link.findAll("span")
        link_artist = spans[0].text
        link_title = replaceHTMLCodes(spans[1].text)
        if clean_title(link_title) != clean_title(title):
            continue
        if clean_title(artist) != clean_title(link_artist):
            continue
        # copy the shared headers so the module-level dict is not mutated,
        # then request the detail page with the search page as referer
        headers2 = dict(headers)
        headers2["referer"] = referer
        html = BS(session.get(link_href, headers=headers2).content)
        tab_content = html.find("div", "tab-content")
        music_links = tab_content.findAll("a", "red-link")
        for music_link in music_links:
            sources.append({'source': 'mp3',
                            'quality': 'HD',
                            'scraper': self.name,
                            'url': music_link["href"],
                            'direct': True})
    return sources
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    # build the slug-style episode URL directly, e.g. /show-title/1-sezon-2-bolum.html
    url_title = title.replace(' ', '-').replace('.', '-').replace(":", "").replace("!", "").replace("?", "").lower()
    episode_url = '/%s/%01d-sezon-%01d-bolum.html' % (url_title, int(season), int(episode))
    return self.sources(replaceHTMLCodes(episode_url))
def sources(self, url):
    sources = []
    try:
        if url is None:
            return sources
        headers = {'User-Agent': random_agent()}
        html = BeautifulSoup(requests.get(url, headers=headers, timeout=30).content)
        r = html.findAll('div', attrs={'class': 'site'})
        for container in r:
            # the real hoster URL is carried in the data-actuallink attribute
            r_url = container.findAll('a')[0]['data-actuallink'].encode('utf-8')
            host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(r_url.strip().lower()).netloc)[0]
            host = replaceHTMLCodes(host)
            host = host.encode('utf-8')
            sources.append({'source': host, 'quality': 'SD', 'scraper': self.name,
                            'url': r_url, 'direct': False})
    except:
        pass
    return sources
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = urlparse.urljoin(self.base_link, self.search_link)
        query = query % urllib.quote_plus(title)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'item_movie'})
        for container in containers:
            try:
                links = container.findAll('h2', attrs={'class': 'tit'})[0]
                r = links.findAll('a')
                for link in r:
                    link_title = link['title'].encode('utf-8')
                    href = link['href'].encode('utf-8')
                    if len(link_title) > 0 and len(href) > 0:
                        # results are titled "Name (YYYY)"; split the title and the year
                        parsed = re.findall('(.+?) \((\d{4})', link_title)
                        parsed_title = parsed[0][0]
                        parsed_years = parsed[0][1]
                        if cleaned_title.lower() == clean_title(parsed_title).lower() and year == parsed_years:
                            if "http:" not in href:
                                href = "http:" + href
                            return self.sources(replaceHTMLCodes(href))
            except:
                pass
    except:
        pass
    return []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = requests.get(query, headers=headers, timeout=30).json()
        results = html['series']
        for item in results:
            r_title = item['label'].encode('utf-8')
            r_link = item['seo'].encode('utf-8')
            if cleaned_title == clean_title(r_title):
                r_page = self.base_link + "/" + r_link
                r_html = BeautifulSoup(requests.get(r_page, headers=headers, timeout=30).content)
                r = r_html.findAll('div', attrs={'class': re.compile('\s*el-item\s*')})
                for container in r:
                    try:
                        r_href = container.findAll('a')[0]['href'].encode('utf-8')
                        r_title = container.findAll('a')[0]['title'].encode('utf-8')
                        # accept the link if "SxxEyy" appears in either its title or its URL
                        episode_check = "[sS]%02d[eE]%02d" % (int(season), int(episode))
                        if re.search(episode_check, r_title):
                            return self.sources(replaceHTMLCodes(r_href))
                        if re.search(episode_check, r_href):
                            return self.sources(replaceHTMLCodes(r_href))
                    except:
                        pass
    except:
        pass
    return []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        query = urlparse.urljoin(self.base_link,
                                 self.tvsearch_link % urllib.quote_plus(title.replace('\'', '').rsplit(':', 1)[0]))
        html = proxy.get(query, 'item')
        # pull in the second page of results if one is advertised
        if 'page=2' in html or 'page%3D2' in html:
            html2 = proxy.get(query + '&page=2', 'item')
            html += html2
        html = BeautifulSoup(html)
        cleaned_title = 'watchputlocker' + clean_title(title)
        years = ['%s' % str(year), '%s' % str(int(year) + 1), '%s' % str(int(year) - 1)]
        items = html.findAll('div', attrs={'class': 'item'})
        show_url = None
        for item in items:
            links = item.findAll('a')
            for link in links:
                href = link['href']
                link_title = link['title']
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                except:
                    pass
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                except:
                    pass
                if cleaned_title == clean_title(link_title) and show_year in link_title:
                    url = re.findall('(?://.+?|)(/.+)', href)[0]
                    show_url = urlparse.urljoin(self.base_link, replaceHTMLCodes(url))
                else:
                    continue
        html = BeautifulSoup(proxy.get(show_url, 'tv_episode_item'))
        season_items = html.findAll('div', attrs={'class': 'show_season'})
        for season_item in season_items:
            if season_item["data-id"] != season:
                continue
            episode_items = season_item.findAll('div', attrs={'class': 'tv_episode_item'})
            for episode_item in episode_items:
                link = episode_item.findAll('a')[-1]
                href = link["href"]
                link_episode = link.contents[0].strip()
                if link_episode != "E%s" % (episode):
                    continue
                # the last tv_num_versions span carries the air date; accept the
                # episode if the requested year (or an adjacent year) appears in it
                link_airdate = link.findAll('span', attrs={'class': 'tv_num_versions'})[-1]
                link_airdate = link_airdate.contents[0]
                if any(candidate_year in link_airdate for candidate_year in years):
                    return self.sources(href)
    except:
        pass
    return []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        ep_id = int(episode)
        season_id = int(season)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'inner'})
        for container in containers:
            show_link = container.findAll('a')[0]
            r_href = show_link['href']
            r_title = show_link['title']
            if cleaned_title in clean_title(r_title) and "tv" in r_title.lower():
                # follow the result to its watch page, then walk the episode list
                redirect = requests.get(r_href, headers=headers, timeout=30).text
                r_url = re.findall('<a href="(.*?)" class="btn-watch"', redirect)[0]
                r_url = r_url.encode('utf-8')
                links = BeautifulSoup(requests.get(r_url, headers=headers, timeout=30).content)
                ep_items = links.findAll('ul', attrs={'class': 'episodelist'})
                for items in ep_items:
                    ep_links = items.findAll('a')
                    for r in ep_links:
                        ep_url = r['href'].encode('utf-8')
                        ep_title = r['title'].encode('utf-8')
                        clean_ep_title = clean_title(ep_title)
                        # episode titles vary, so try several SxxEyy-style patterns
                        if ("s%02de%02d" % (season_id, ep_id) in clean_ep_title
                                or "s%02d%02d" % (season_id, ep_id) in clean_ep_title
                                or "s%02d%d" % (season_id, ep_id) in clean_ep_title
                                or "epse%d%d" % (season_id, ep_id) in clean_ep_title):
                            return self.sources(replaceHTMLCodes(ep_url))
    except:
        pass
    return []
def scrape_movie(self, title, year, imdb):
    try:
        query = self.moviesearch_link % urllib.quote_plus(title.replace('\'', '').rsplit(':', 1)[0])
        query = urlparse.urljoin(self.base_link, query)
        html = proxy.get(query, 'item')
        # pull in the second page of results if one is advertised
        if 'page=2' in html or 'page%3D2' in html:
            html2 = proxy.get(query + '&page=2', 'item')
            html += html2
        html = BeautifulSoup(html)
        cleaned_title = 'watchputlocker' + clean_title(title)
        years = ['(%s)' % str(year), '(%s)' % str(int(year) + 1), '(%s)' % str(int(year) - 1)]
        items = html.findAll('div', attrs={'class': 'item'})
        for item in items:
            links = item.findAll('a')
            for link in links:
                href = link['href']
                link_title = link['title']
                if any(candidate_year in link_title for candidate_year in years):
                    # unwrap optional ?u=/?q= redirect wrappers before matching
                    try:
                        href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                    except:
                        pass
                    try:
                        href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                    except:
                        pass
                    if cleaned_title == clean_title(link_title):
                        url = re.findall('(?://.+?|)(/.+)', href)[0]
                        url = replaceHTMLCodes(url)
                        return self.sources(url)
    except:
        pass
    return []
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        searchquery = self.search_link % (urllib.quote_plus(title), year)
        query = urlparse.urljoin(self.base_link, searchquery)
        cleaned_title = clean_title(title)
        html = requests.get(query, headers=headers).content
        html = BeautifulSoup(html)
        containers = html.findAll('div', attrs={'class': 'short_content'})
        for items in containers:
            href = items.findAll('a')[0]['href']
            result_title = items.findAll('div', attrs={'class': 'short_header'})[0]
            # only accept results whose header mentions the requested year
            if year in str(result_title):
                result_title = normalize(str(result_title))
                if result_title == cleaned_title:
                    return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'cell_container'})
        for container in containers:
            links = container.findAll('a')
            for link in links:
                link_title = link['title']
                href = link['href']
                if len(link_title) > 0 and len(href) > 0:
                    # results are titled "Name (YYYY)"; split the title and the year
                    parsed = re.findall('(.+?) \((\d{4})', link_title)
                    parsed_title = parsed[0][0]
                    parsed_years = parsed[0][1]
                    if cleaned_title == clean_title(parsed_title) and year == parsed_years:
                        return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
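# The "Name (YYYY)"-style movie scrapers above split the result title with the
# same regex; a quick illustration on a made-up link title:
#
#   >>> re.findall('(.+?) \((\d{4})', 'Movie Name (2016) HD')
#   [('Movie Name', '2016')]
#
# parsed[0][0] is the bare title handed to clean_title(); parsed[0][1] is the
# year compared against the requested one.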
def sources(self, url):
    sources = []
    try:
        if url is None:
            return sources
        referer = url
        headers = {'User-Agent': random_agent()}
        url = url.replace('/watching.html', '')
        html = requests.get(url, headers=headers).content
        # split off the ?episode= marker appended by scrape_episode(), if any
        try:
            url, episode = re.findall('(.+?)\?episode=(\d*)$', url)[0]
        except:
            episode = None
        vid_id = re.findall('-(\d+)', url)[-1]
        quality = re.findall('<span class="quality">(.*?)</span>', html)
        quality = quality[0].lower() if quality else ''
        if quality == 'cam' or quality == 'ts':
            quality = 'CAM'
        elif quality == 'hd':
            quality = '720'
        else:
            quality = '480'
        try:
            headers = {'X-Requested-With': 'XMLHttpRequest',
                       'Referer': referer,
                       'User-Agent': random_agent()}
            u = urlparse.urljoin(self.base_link, self.server_link % vid_id)
            r = BeautifulSoup(requests.get(u, headers=headers).content)
            containers = r.findAll('div', attrs={'class': 'les-content'})
            for result in containers:
                links = result.findAll('a')
                for link in links:
                    title = str(link['title'])
                    if episode is not None:
                        # only keep the link for the requested episode number
                        title = re.findall('Episode\s+(\d+):', title)[0]
                        title = '%01d' % int(title)
                        if title == episode:
                            episode_id = str(link['episode-id'])
                        else:
                            continue
                    else:
                        episode_id = str(link['episode-id'])
                    key_gen = ''.join(random.choice(string.ascii_lowercase + string.digits) for x in range(16))
                    # ---- fix from Mucky Duck & XUnity Talk: build the hash/cookie pair ----
                    key = '87wwxtp3dqii'
                    key2 = '7bcq9826avrbi6m49vd7shxkn985mhod'
                    cookie = hashlib.md5(episode_id + key).hexdigest() + '=%s' % key_gen
                    a = episode_id + key2
                    b = key_gen
                    i = b[-1]
                    h = b[:-1]
                    b = i + h + i + h + i + h
                    hash_id = uncensored(a, b)
                    # -----------------------------------------------------------------------
                    serverurl = self.base_link + '/ajax/v2_get_sources/' + episode_id + '?hash=' + urllib.quote(hash_id)
                    headers = {'Accept-Language': 'en-US',
                               'Cookie': cookie,
                               'Referer': referer,
                               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
                               'X-Requested-With': 'XMLHttpRequest'}
                    result = requests.get(serverurl, headers=headers).content
                    result = result.replace('\\', '')
                    # collect the google-video file URLs and keep the best quality available
                    url = re.findall('"?file"?\s*:\s*"(.+?)"', result)
                    url = [googletag(i) for i in url]
                    url = [i[0] for i in url if len(i) > 0]
                    u = []
                    try:
                        u += [[i for i in url if i['quality'] == '1080p'][0]]
                    except:
                        pass
                    try:
                        u += [[i for i in url if i['quality'] == '720'][0]]
                    except:
                        pass
                    try:
                        u += [[i for i in url if i['quality'] == '480'][0]]
                    except:
                        pass
                    url = replaceHTMLCodes(u[0]['url'])
                    quality = googletag(url)[0]['quality']
                    sources.append({'source': 'google video', 'quality': quality, 'scraper': self.name,
                                    'url': url, 'direct': True})
        except:
            pass
    except:
        pass
    return sources
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    # build the slug-style episode URL directly, e.g. /show-title/1-sezon/2-bolum
    url_title = title.replace(' ', '-').replace('.', '-').replace(":", "").replace("!", "").replace("?", "").lower()
    episode_url = '/%s/%01d-sezon/%01d-bolum' % (url_title, int(season), int(episode))
    return self.sources(replaceHTMLCodes(episode_url))
def sources(self, url):
    sources = []
    try:
        if url is None:
            return sources
        if self.base_link not in url:
            url = urlparse.urljoin(self.base_link, url)
        # an appended ?episode=N marks this URL as an episode page
        content = re.compile('(.+?)\?episode=\d*$').findall(url)
        video_type = 'movie' if len(content) == 0 else 'episode'
        try:
            url, episode = re.compile('(.+?)\?episode=(\d*)$').findall(url)[0]
        except:
            pass
        headers = {'User-Agent': random_agent()}
        html = self.scraper.get(url, headers=headers, timeout=30).content
        # the page may arrive gzip-compressed; fall back to the raw body if not
        try:
            compressedstream = StringIO.StringIO(html)
            html = gzip.GzipFile(fileobj=compressedstream).read()
            html = BeautifulSoup(html)
        except:
            html = BeautifulSoup(html)
        links = html.findAll('a', attrs={'target': 'EZWebPlayer'})
        for link in links:
            href = replaceHTMLCodes(link['href'])
            if "get.php" not in href:
                continue
            if video_type == 'episode':
                # keep only the link whose trailing number matches the wanted episode
                link_episode_number = re.compile('(\d+)').findall(link.string)
                if len(link_episode_number) > 0:
                    link_episode_number = link_episode_number[-1]
                    if not link_episode_number == '%01d' % int(episode):
                        continue
            referer = url
            headers = {'User-Agent': random_agent(), 'Referer': referer}
            html = self.scraper.get(href, headers=headers, timeout=30).content
            # pull the JW-player style sources array and its file/label pairs
            source = re.findall('sources\s*:\s*\[(.+?)\]', html)[0]
            files = re.findall('"file"\s*:\s*"(.+?)".+?"label"\s*:\s*"(.+?)"', source)
            if files:
                quality_url_pairs = [{'url': file[0], 'quality': file[1][:-1]} for file in files]
            else:
                files = re.findall('"file"\s*:\s*"(.+?)".+?}', source)
                quality_url_pairs = [{'url': file, 'quality': "SD"} for file in files]
            for pair in quality_url_pairs:
                sources.append({'source': 'google video', 'quality': pair['quality'],
                                'scraper': self.name, 'url': pair['url'], 'direct': True})
    except:
        pass
    return sources
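# Taken together, these methods form the scraper interface used above:
# scrape_movie()/scrape_episode() locate the right page and hand it to
# sources(), which returns a list of dicts of the form
#   {'source': host, 'quality': '...', 'scraper': name, 'url': link, 'direct': bool}
# Below is a minimal sketch of how calling code might drive one scraper and
# rank its results; the function name and the quality ordering are illustrative
# assumptions, not part of this excerpt.
QUALITY_ORDER = {'1080p': 0, '720': 1, '480': 2, 'HD': 3, 'SD': 4, 'CAM': 5}


def collect_movie_sources(scraper, title, year, imdb):
    links = scraper.scrape_movie(title, year, imdb)  # [] when nothing matched
    # sort best quality first; unknown quality labels sort last
    return sorted(links, key=lambda link: QUALITY_ORDER.get(link['quality'], len(QUALITY_ORDER)))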