def process_results_page(self, html, title, artist, referer):
    sources = []
    result = html.findAll("div", attrs={"id": "mp3list"})
    for item in result:
        li = item.find("li", "mp3list-play")
        if not li:
            continue
        playlink = li.find("a")["href"]
        unselectable_text = li.find("div", "unselectable").contents[0]
        # entries read "Artist - Title"; split once so hyphenated titles survive
        parts = unselectable_text.split("-", 1)
        link_artist = parts[0].strip()
        link_title = parts[1].strip()
        if not clean_title(link_title) == clean_title(title):
            continue
        if not clean_title(artist) == clean_title(link_artist):
            continue
        label = "%s - %s" % (link_artist, link_title)
        sources.append({
            'source': label,
            'quality': 'HD',
            'scraper': self.name,
            'url': playlink,
            'direct': True
        })
    return sources
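# Hedged usage sketch (not part of the original module): how a scraper's
# scrape_music() might feed process_results_page(). The "/search/%s" path is
# an assumption, and the module-level helpers used throughout this file
# (random_agent, requests, BeautifulSoup, urllib, urlparse) are assumed here.
def scrape_music(self, title, artist):
    headers = {'User-Agent': random_agent()}
    query = urlparse.urljoin(self.base_link,
                             "/search/%s" % urllib.quote_plus(title))
    html = BeautifulSoup(
        requests.get(query, headers=headers, timeout=30).content)
    return self.process_results_page(html, title, artist, referer=query)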
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    headers = {'User-Agent': random_agent()}
    q = (title.translate(None, '\/:*?"\'<>|!,')).replace(' ', '-').replace('--', '-').lower()
    query = urlparse.urljoin(self.base_link, self.tv_search_link % q)
    cleaned_title = clean_title(title)
    html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
    links = html.findAll('a', attrs={'class': 'top-h1'})
    show_url = None
    for link in links:
        link_title = link.text
        if cleaned_title == clean_title(link_title):
            show_url = link["href"]
            break
    if show_url:
        html = BeautifulSoup(requests.get(show_url, headers=headers, timeout=30).content)
        link_container = html.findAll("div", attrs={'class': 'bottom'})[-1]
        episode_links = link_container.findAll("a")
        episode_format1 = "S%02dE%02d" % (int(season), int(episode))
        episode_format2 = "S%02d-E%02d" % (int(season), int(episode))
        for episode_link in episode_links:
            button = episode_link.contents[0]
            episode_text = button.text
            if episode_format1 in episode_text or episode_format2 in episode_text:
                episode_url = episode_link["href"]
                return self.sources(episode_url, "SD")
    return []
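# Worked example of the slug built above: Python 2 str.translate(None, chars)
# deletes the listed characters, then spaces become hyphens (title hypothetical).
q = ("Marvel's Agents of S.H.I.E.L.D.".translate(None, '\/:*?"\'<>|!,')) \
    .replace(' ', '-').replace('--', '-').lower()
# q == "marvels-agents-of-s.h.i.e.l.d."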
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = urlparse.urljoin(self.base_link, self.search_link)
        query = query % urllib.quote_plus(title)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'item_movie'})
        for container in containers:
            try:
                links = container.findAll('h2', attrs={'class': 'tit'})[0]
                r = links.findAll('a')
                for link in r:
                    link_title = link['title'].encode('utf-8')
                    href = link['href'].encode('utf-8')
                    if len(link_title) > 0 and len(href) > 0:
                        parsed = re.findall('(.+?) \((\d{4})', link_title)
                        parsed_title = parsed[0][0]
                        parsed_years = parsed[0][1]
                        if cleaned_title.lower() == clean_title(parsed_title).lower() and year == parsed_years:
                            if not "http:" in href:
                                href = "http:" + href
                            return self.sources(replaceHTMLCodes(href))
            except:
                pass
    except:
        pass
    return []
def process_results_page(self, html, title, artist, referer):
    sources = []
    result = html.find("div", "result")
    for item in result.findAll("div", "item"):
        title_block = item.find("div", "title")
        link = title_block.find("a")
        link_href = link["href"]
        spans = link.findAll("span")
        link_artist = spans[0].text
        link_title = replaceHTMLCodes(spans[1].text)
        if not clean_title(link_title) == clean_title(title):
            continue
        if not clean_title(artist) == clean_title(link_artist):
            continue
        # copy the shared module-level headers so the referer isn't left
        # behind on the original dict
        headers2 = dict(headers)
        headers2["referer"] = referer
        html = BS(session.get(link_href, headers=headers2).content)
        tab_content = html.find("div", "tab-content")
        music_links = tab_content.findAll("a", "red-link")
        for music_link in music_links:
            sources.append({
                'source': 'mp3',
                'quality': 'HD',
                'scraper': self.name,
                'url': music_link["href"],
                'direct': True
            })
    return sources
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title.replace("'", " ")))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'ml-item'})
        for result in containers:
            links = result.findAll('a')
            for link in links:
                link_title = str(link['title'])
                href = str(link['href'])
                info = str(link['data-url'])
                if clean_title(link_title) == cleaned_title:
                    html = requests.get(info, headers=headers).content
                    pattern = '<div class="jt-info">%s</div>' % year
                    match = re.findall(pattern, html)
                    if match:
                        return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = "%s+season+%s" % (urllib.quote_plus(title), season)
        query = self.search_link % query
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        checkseason = cleaned_title + "season" + season
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'ml-item'})
        for result in containers:
            links = result.findAll('a')
            for link in links:
                link_title = str(link['title'])
                href = str(link['href'])
                if clean_title(link_title) == checkseason:
                    ep_id = '?episode=%01d' % int(episode)
                    href = href + ep_id
                    return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title) + "+" + str(year))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'inner'})
        for container in containers:
            movie_link = container.findAll('a')[0]
            r_href = movie_link['href']
            r_title = movie_link['title']
            link_year = container.findAll('span', attrs={'class': 'year'})[0].findAll('a')[0].text
            if str(year) == link_year:
                if cleaned_title in clean_title(r_title):
                    # the result page redirects to the actual player page
                    redirect = requests.get(r_href, headers=headers, timeout=30).text
                    r_url = re.findall('<a href="(.*?)" class="btn-watch"', redirect)[0]
                    r_url = r_url.encode('utf-8')
                    return self.sources(replaceHTMLCodes(r_url))
    except:
        pass
    return []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        query = urlparse.urljoin(
            self.base_link,
            self.tvsearch_link % urllib.quote_plus(title.replace('\'', '').rsplit(':', 1)[0]))
        html = proxy.get(query, 'item')
        if 'page=2' in html or 'page%3D2' in html:
            html2 = proxy.get(query + '&page=2', 'item')
            html += html2
        html = BeautifulSoup(html)
        cleaned_title = 'watchputlocker' + clean_title(title)
        years = ['%s' % str(year), '%s' % str(int(year) + 1), '%s' % str(int(year) - 1)]
        items = html.findAll('div', attrs={'class': 'item'})
        show_url = None
        for item in items:
            links = item.findAll('a')
            for link in links:
                href = link['href']
                link_title = link['title']
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                except:
                    pass
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                except:
                    pass
                if cleaned_title == clean_title(link_title) and show_year in link_title:
                    url = re.findall('(?://.+?|)(/.+)', href)[0]
                    show_url = urlparse.urljoin(self.base_link, replaceHTMLCodes(url))
        # bail out if no result matched instead of fetching None
        if not show_url:
            return []
        html = BeautifulSoup(proxy.get(show_url, 'tv_episode_item'))
        season_items = html.findAll('div', attrs={'class': 'show_season'})
        for season_item in season_items:
            if season_item["data-id"] != season:
                continue
            episode_items = season_item.findAll('div', attrs={'class': 'tv_episode_item'})
            for episode_item in episode_items:
                link = episode_item.findAll('a')[-1]
                href = link["href"]
                link_episode = link.contents[0].strip()
                if link_episode != "E%s" % (episode):
                    continue
                # the last tv_num_versions span holds the air date
                link_airdate = link.findAll('span', attrs={'class': 'tv_num_versions'})[-1]
                link_airdate = link_airdate.contents[0]
                if any(candidate_year in link_airdate for candidate_year in years):
                    return self.sources(href)
    except:
        pass
    return []
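# Worked example of the proxy-link unwrapping used above: search results point
# at a redirector whose real target sits in the "u" (or "q") query parameter.
# The URL below is hypothetical.
import urlparse
href = "http://example-proxy/redirect?u=/watch/some-show"
href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
# href == "/watch/some-show"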
def get_url(scraper, title, show_year, year, season, episode, imdb, tvdb, type,
            cache_location, maximum_age):
    cache_enabled = xbmcaddon.Addon('script.module.nanscrapers').getSetting("cache_enabled") == 'true'
    try:
        dbcon = database.connect(cache_location)
        dbcur = dbcon.cursor()
        try:
            dbcur.execute("SELECT * FROM version")
            match = dbcur.fetchone()
        except:
            nanscrapers.clear_cache()
            dbcur.execute("CREATE TABLE version (version TEXT)")
            dbcur.execute("INSERT INTO version Values ('0.5.4')")
            dbcon.commit()
        dbcur.execute(
            "CREATE TABLE IF NOT EXISTS rel_src ("
            "scraper TEXT, "
            "title Text, show_year TEXT, year TEXT, "
            "season TEXT, "
            "episode TEXT, "
            "imdb_id TEXT, "
            "urls TEXT, "
            "added TEXT, "
            "UNIQUE(scraper, title, year, season, episode)"
            ");")
    except:
        pass
    if cache_enabled:
        try:
            sources = []
            dbcur.execute(
                "SELECT * FROM rel_src WHERE scraper = '%s' AND title = '%s' AND show_year= '%s' AND year = '%s' AND season = '%s' AND episode = '%s'"
                % (scraper.name, clean_title(title).upper(), show_year, year, season, episode))
            match = dbcur.fetchone()
            t1 = int(re.sub('[^0-9]', '', str(match[8])))
            t2 = int(datetime.datetime.now().strftime("%Y%m%d%H%M"))
            update = abs(t2 - t1) > maximum_age
            if update == False:
                sources = json.loads(match[7])
                return sources
        except:
            pass
    try:
        sources = []
        if type == "movie":
            sources = scraper.scrape_movie(title, year, imdb)
        elif type == "episode":
            sources = scraper.scrape_episode(title, show_year, year, season, episode, imdb, tvdb)
        if sources == None:
            sources = []
        else:
            if cache_enabled:
                dbcur.execute(
                    "DELETE FROM rel_src WHERE scraper = '%s' AND title = '%s' AND show_year= '%s' AND year = '%s' AND season = '%s' AND episode = '%s'"
                    % (scraper.name, clean_title(title).upper(), show_year, year, season, episode))
                dbcur.execute(
                    "INSERT INTO rel_src Values (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                    (scraper.name, clean_title(title).upper(), show_year, year, season,
                     episode, imdb, json.dumps(sources),
                     datetime.datetime.now().strftime("%Y-%m-%d %H:%M")))
                dbcon.commit()
        return sources
    except:
        pass
    return []
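# A minimal sketch (assumption, not the shipped code): the cache lookup above
# interpolates clean_title(title) straight into the SQL string, which breaks
# on any title that still carries a quote. sqlite3 parameter binding avoids
# that; table and column names match rel_src as created above.
def cached_sources(dbcur, scraper_name, title, show_year, year, season, episode):
    dbcur.execute(
        "SELECT urls, added FROM rel_src WHERE scraper = ? AND title = ? AND "
        "show_year = ? AND year = ? AND season = ? AND episode = ?",
        (scraper_name, clean_title(title).upper(), str(show_year), str(year),
         str(season), str(episode)))
    return dbcur.fetchone()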
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        ep_id = int(episode)
        season_id = int(season)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'inner'})
        for container in containers:
            show_link = container.findAll('a')[0]
            r_href = show_link['href']
            r_title = show_link['title']
            if cleaned_title in clean_title(r_title) and "tv" in r_title.lower():
                redirect = requests.get(r_href, headers=headers, timeout=30).text
                r_url = re.findall('<a href="(.*?)" class="btn-watch"', redirect)[0]
                r_url = r_url.encode('utf-8')
                links = BeautifulSoup(requests.get(r_url, headers=headers, timeout=30).content)
                ep_items = links.findAll('ul', attrs={'class': 'episodelist'})
                for items in ep_items:
                    ep_links = items.findAll('a')
                    for r in ep_links:
                        ep_url = r['href'].encode('utf-8')
                        ep_title = r['title'].encode('utf-8')
                        clean_ep_title = clean_title(ep_title)
                        # the site labels episodes inconsistently, so try the
                        # common SxxEyy spellings in turn
                        if ("s%02de%02d" % (season_id, ep_id) in clean_ep_title
                                or "s%02d%02d" % (season_id, ep_id) in clean_ep_title
                                or "s%02d%d" % (season_id, ep_id) in clean_ep_title
                                or "epse%d%d" % (season_id, ep_id) in clean_ep_title):
                            return self.sources(replaceHTMLCodes(ep_url))
    except:
        pass
    return []
# NB: the "muscic" typo is kept because callers import the function by this name.
def get_muscic_url(scraper, title, artist, cache_location, maximum_age, debrid=False):
    cache_enabled = xbmcaddon.Addon('script.module.nanscrapers').getSetting("cache_enabled") == 'true'
    try:
        dbcon = database.connect(cache_location)
        dbcur = dbcon.cursor()
        try:
            dbcur.execute("SELECT * FROM version")
            match = dbcur.fetchone()
        except:
            nanscrapers.clear_cache()
            dbcur.execute("CREATE TABLE version (version TEXT)")
            dbcur.execute("INSERT INTO version Values ('0.5.4')")
            dbcon.commit()
        dbcur.execute(
            "CREATE TABLE IF NOT EXISTS rel_music_src ("
            "scraper TEXT, "
            "title Text, "
            "artist TEXT, "
            "urls TEXT, "
            "added TEXT, "
            "UNIQUE(scraper, title, artist)"
            ");")
    except:
        pass
    if cache_enabled:
        try:
            sources = []
            dbcur.execute(
                "SELECT * FROM rel_music_src WHERE scraper = '%s' AND title = '%s' AND artist = '%s'"
                % (scraper.name, clean_title(title).upper(), artist.upper()))
            match = dbcur.fetchone()
            t1 = int(re.sub('[^0-9]', '', str(match[4])))
            t2 = int(datetime.datetime.now().strftime("%Y%m%d%H%M"))
            update = abs(t2 - t1) > maximum_age
            if update == False:
                sources = json.loads(match[3])
                return sources
        except:
            pass
    try:
        sources = scraper.scrape_music(title, artist, debrid=debrid)
        if sources == None:
            sources = []
        else:
            if cache_enabled:
                # was artist.upper (missing call parentheses), which
                # interpolated the bound-method repr into the SQL
                dbcur.execute(
                    "DELETE FROM rel_music_src WHERE scraper = '%s' AND title = '%s' AND artist = '%s'"
                    % (scraper.name, clean_title(title).upper(), artist.upper()))
                dbcur.execute(
                    "INSERT INTO rel_music_src Values (?, ?, ?, ?, ?)",
                    (scraper.name, clean_title(title).upper(), artist.upper(),
                     json.dumps(sources),
                     datetime.datetime.now().strftime("%Y-%m-%d %H:%M")))
                dbcon.commit()
        return sources
    except:
        pass
    return []
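# Sketch of a stricter freshness check (assumption, not the shipped behaviour):
# the t1/t2 comparison above subtracts YYYYMMDDHHMM integers, so crossing a day
# boundary jumps by ~7600 rather than 60 * 24. Parsing the stored "added"
# stamp gives a true age in minutes.
import datetime

def is_stale(added, maximum_age_minutes):
    then = datetime.datetime.strptime(added, "%Y-%m-%d %H:%M")
    age = datetime.datetime.now() - then
    return (age.total_seconds() / 60.0) > maximum_age_minutes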
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        html = BeautifulSoup(self.get_html(title, self.tvsearch_link))
        index_items = html.findAll('div', attrs={'class': re.compile('index_item.+?')})
        title = 'watch' + clean_title(title).replace(": ", "")
        for index_item in index_items:
            try:
                links = index_item.findAll('a')
                for link in links:
                    href = link['href']
                    link_title = link['title']
                    try:
                        href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                    except:
                        pass
                    try:
                        href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                    except:
                        pass
                    if title == clean_title(link_title):
                        # href is the show page relative url
                        show_url = urlparse.urljoin(self.base_link, href)
                        html = BeautifulSoup(proxy.get(show_url, 'tv_episode_item'))
                        seasons = html.findAll('div', attrs={'class': 'show_season'})
                        for scraped_season in seasons:
                            if scraped_season['data-id'] == season:
                                tv_episode_items = scraped_season.findAll('div', attrs={'class': 'tv_episode_item'})
                                for tv_episode_item in tv_episode_items:
                                    links = tv_episode_item.findAll('a')
                                    for link in links:
                                        if link.contents[0].strip() == "E%s" % episode:
                                            episode_href = link['href']
                                            try:
                                                episode_href = urlparse.parse_qs(
                                                    urlparse.urlparse(episode_href).query)['u'][0]
                                            except:
                                                pass
                                            try:
                                                episode_href = urlparse.parse_qs(
                                                    urlparse.urlparse(episode_href).query)['q'][0]
                                            except:
                                                pass
                                            return self.sources(episode_href)
            except:
                continue
    except:
        pass
    return []
def scrape_music(self, title, artist, debrid=False):
    try:
        song_search = clean_title(title.lower()).replace(' ', '+')
        artist_search = clean_title(artist.lower()).replace(' ', '+')
        song_comp = clean_title(title.lower())
        artist_comp = clean_title(artist.lower())
        total = artist_comp + '-' + song_comp
        start_url = '%sresults?search=%s+%s' % (self.base_link, artist_search, song_search)
        html = requests.get(start_url, headers=headers, timeout=20).content
        match = re.compile('<h4 class="card-title">(.+?)</h4>.+?href="(.+?)"', re.DOTALL).findall(html)
        for m, link in match:
            match2 = m.replace('\n', '').replace('\t', '').replace(' ', '')
            match3 = match2.lower()
            # whatever trails the "artist-song" stem is treated as the quality tag
            quals = re.compile(str(total) + '(.+?)>').findall(str(match3) + '>')
            qual1 = str(quals)
            qual = qual1.replace("[", "").replace("]", "")
            if clean_title(title).lower() in clean_title(match2).lower():
                if clean_title(artist).lower() in clean_title(match2).lower():
                    self.sources.append({
                        'source': 'Youtube',
                        'quality': qual,
                        'scraper': self.name,
                        'url': link,
                        'direct': True
                    })
        return self.sources
    except Exception, argument:
        return self.sources
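# Worked example of the quality extraction above (card text hypothetical):
import re
total = "adele-hello"
match3 = "adele-hello320kbps"
quals = re.compile(str(total) + '(.+?)>').findall(str(match3) + '>')
# quals == ['320kbps']; str(quals) with the brackets stripped still carries
# the inner quotes, so the stored quality string ends up as "'320kbps'"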
def scrape_movie(self, title, year, imdb):
    try:
        query = self.moviesearch_link % urllib.quote_plus(title.replace('\'', '').rsplit(':', 1)[0])
        query = urlparse.urljoin(self.base_link, query)
        html = proxy.get(query, 'item')
        if 'page=2' in html or 'page%3D2' in html:
            html2 = proxy.get(query + '&page=2', 'item')
            html += html2
        html = BeautifulSoup(html)
        cleaned_title = 'watchputlocker' + clean_title(title)
        years = ['(%s)' % str(year), '(%s)' % str(int(year) + 1), '(%s)' % str(int(year) - 1)]
        items = html.findAll('div', attrs={'class': 'item'})
        for item in items:
            links = item.findAll('a')
            for link in links:
                href = link['href']
                link_title = link['title']
                if any(candidate_year in link_title for candidate_year in years):
                    try:
                        href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                    except:
                        pass
                    try:
                        href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                    except:
                        pass
                    if cleaned_title == clean_title(link_title):
                        url = re.findall('(?://.+?|)(/.+)', href)[0]
                        url = replaceHTMLCodes(url)
                        return self.sources(url)
    except:
        pass
    return []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = requests.get(query, headers=headers, timeout=30).json()
        results = html['series']
        for item in results:
            r_title = item['label'].encode('utf-8')
            r_link = item['seo'].encode('utf-8')
            if cleaned_title == clean_title(r_title):
                r_page = self.base_link + "/" + r_link
                r_html = BeautifulSoup(requests.get(r_page, headers=headers, timeout=30).content)
                r = r_html.findAll('div', attrs={'class': re.compile('\s*el-item\s*')})
                for container in r:
                    try:
                        r_href = container.findAll('a')[0]['href'].encode('utf-8')
                        r_title = container.findAll('a')[0]['title'].encode('utf-8')
                        episode_check = "[sS]%02d[eE]%02d" % (int(season), int(episode))
                        match = re.search(episode_check, r_title)
                        if match:
                            return self.sources(replaceHTMLCodes(r_href))
                        match2 = re.search(episode_check, r_href)
                        if match2:
                            return self.sources(replaceHTMLCodes(r_href))
                    except:
                        pass
    except:
        pass
    return []
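# Quick check of the SxxEyy pattern built above (values hypothetical):
import re
episode_check = "[sS]%02d[eE]%02d" % (4, 9)   # -> "[sS]04[eE]09"
assert re.search(episode_check, "Show S04E09 HDTV")
assert re.search(episode_check, "/watch/show-s04e09-online")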
def scrape_movie(self, title, year, imdb):
    try:
        html = BeautifulSoup(self.get_html(title, self.moviesearch_link))
        index_items = html.findAll('div', attrs={'class': 'index_item index_item_ie'})
        title = 'watch' + clean_title(title).replace(": ", "").replace("'", "")
        years = ['(%s)' % str(year), '(%s)' % str(int(year) + 1), '(%s)' % str(int(year) - 1)]
        fallback = None
        for index_item in index_items:
            try:
                links = index_item.findAll('a')
                for link in links:
                    href = link['href']
                    link_title = link['title']
                    if any(x in link_title for x in years) or not "(" in link_title:
                        try:
                            href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                        except:
                            pass
                        try:
                            href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                        except:
                            pass
                        if title.lower() == clean_title(link_title):
                            if '(%s)' % str(year) in link_title:
                                return self.sources(href)
                            else:
                                fallback = href
            except:
                continue
        if fallback:
            return self.sources(fallback)
    except:
        pass
    return []
def tvshow(self, url, title, season, episode):
    try:
        self.url = []
        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        year = data['year']
        cleanmovie = clean_title(title)
        data['season'], data['episode'] = season, episode
        seasoncheck = "season%s" % season
        episode = "%01d" % int(episode)
        checktitle = cleanmovie + seasoncheck
        seasonquery = "season+%s" % season
        query = self.search_link % (urllib.quote_plus(title), seasonquery)
        query = urlparse.urljoin(self.base_link, query)
        link = BeautifulSoup(requests.get(query).text)
        r = link.findAll('div', attrs={'class': 'ml-item'})
        for links in r:
            page_links = links.findAll('a')[0]
            pageurl = page_links['href']
            info = page_links['rel']
            title = page_links['title']
            info = info.encode('utf-8')
            title = title.encode('utf-8')
            if checktitle == clean_title(title):
                pageurl = pageurl.encode('utf-8')
                ep_url = pageurl + 'watch/'
                referer = ep_url
                ep_links = BeautifulSoup(requests.get(ep_url).text)
                r_ep = ep_links.findAll('div', attrs={'class': 'les-content'})
                for item in r_ep:
                    # findall needs a string; item.contents is a list of Tags
                    match = re.compile('<a href="(.*?)" class=.*?">Episode\s*(\d+)').findall(str(item))
                    for href, ep_items in match:
                        ep_items = '%01d' % int(ep_items)
                        if ep_items == episode:
                            self.url.append([href, referer])
        self.Sources(self.url)
    except:
        return
def scrape_movie(self, imdb, title, year):
    try:
        self.url = []
        title = getsearch(title)
        cleanmovie = clean_title(title)
        query = self.search_link % (urllib.quote_plus(title), year)
        query = urlparse.urljoin(self.base_link, query)
        link = requests.get(query).text
        html = BeautifulSoup(link)
        r = html.findAll('div', attrs={'class': 'ml-item'})
        for links in r:
            page_links = links.findAll('a')[0]
            pageurl = page_links['href']
            info = page_links['rel']
            title = page_links['title']
            info = info.encode('utf-8')
            title = title.encode('utf-8')
            if cleanmovie in clean_title(title):
                infolink = requests.get(info).text
                match_year = re.search('class="jt-info">(\d{4})<', infolink)
                match_year = match_year.group(1)
                if year in match_year:
                    pageurl = pageurl.encode('utf-8')
                    url = pageurl + 'watch/'
                    referer = url
                    link = BeautifulSoup(requests.get(url).text)
                    r = link.findAll('div', attrs={'class': 'les-content'})
                    for item in r:
                        try:
                            vidlinks = item.findAll('a')[0]['href']
                            vidlinks = vidlinks.encode('utf-8')
                            self.url.append([vidlinks, referer])
                        except:
                            pass
        self.Sources(self.url)
    except:
        return self.url
def scrape_music(self, title, artist, debrid=False):
    try:
        song_search = clean_title(title.lower()).replace(' ', '+')
        artist_search = clean_title(artist.lower()).replace(' ', '+')
        start_url = '%sresults?search_query=%s+%s' % (self.base_link, artist_search, song_search)
        html = requests.get(start_url, headers=headers, timeout=20).content
        match = re.compile('<h4 class="card-title">.+?</i>(.+?)</h4>.+?id="(.+?)"', re.DOTALL).findall(html)
        count = 0
        for m, link in match:
            # strip layout whitespace, then decode the common HTML entities
            match4 = m.replace('\n', '').replace('\t', '').replace('  ', ' ')
            match5 = re.sub('&#(\d+);', '', match4)
            match5 = re.sub('(&#[0-9]+)([^;^0-9]+)', '\\1;\\2', match5)
            match5 = match5.replace('&quot;', '"').replace('&amp;', '&')
            match5 = re.sub('\\\|/|\(|\)|\[|\]|\{|\}|-|:|;|\*|\?|"|\'|<|>|\_|\.|\?', ' ', match5)
            match5 = ' '.join(match5.split())
            match2 = m.replace('\n', '').replace('\t', '').replace(' ', '')
            if clean_title(title).lower() in clean_title(match2).lower():
                if clean_title(artist).lower() in clean_title(match2).lower():
                    final_link = 'https://www.youtube.com/watch?v=' + link
                    count += 1
                    self.sources.append({'source': self.name, 'quality': 'SD',
                                         'scraper': match5, 'url': final_link,
                                         'direct': False})
        if dev_log == 'true':
            end_time = time.time() - self.start_time
            send_log(self.name, end_time, count)
        return self.sources
    except Exception, argument:
        return self.sources
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        q = (title.translate(None, '\/:*?"\'<>|!,')).replace(' ', '-').replace('--', '-').lower()
        query = urlparse.urljoin(self.base_link, self.movie_search_link % q)
        cleaned_title = clean_title(title)
        html = requests.get(query, headers=headers, timeout=30).content
        containers = re.compile('<a class="top-item".*href="(.*?)"><cite>(.*?)</cite></a>').findall(html)
        for href, title in containers:
            parsed = re.findall('(.+?) \((\d{4})', title)
            parsed_title = parsed[0][0]
            parsed_years = parsed[0][1]
            if cleaned_title == clean_title(parsed_title) and year == parsed_years:
                try:
                    headers = {'User-Agent': random_agent()}
                    html = requests.get(href, headers=headers, timeout=30).content
                    parsed_html = BeautifulSoup(html)
                    quality_title = parsed_html.findAll("h3", attrs={'title': re.compile("Quality of ")})[0]
                    quality = quality_title.findAll('span')[0].text
                    match = re.search('href="([^"]+-full-movie-[^"]+)', html)
                    if match:
                        url = match.group(1)
                        return self.sources(url, "SD")
                except:
                    pass
    except:
        pass
    return []
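# Worked example of the '(.+?) \((\d{4})' parse used above (title hypothetical):
import re
parsed = re.findall('(.+?) \((\d{4})', "Blade Runner (1982) HD")
# parsed == [('Blade Runner', '1982')] -> parsed[0][0] is the title and
# parsed[0][1] the year string compared against the requested year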
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'cell_container'})
        for container in containers:
            links = container.findAll('a')
            for link in links:
                link_title = link['title']
                href = link['href']
                if len(link_title) > 0 and len(href) > 0:
                    parsed = re.findall('(.+?) \((\d{4})', link_title)
                    parsed_title = parsed[0][0]
                    parsed_years = parsed[0][1]
                    if cleaned_title == clean_title(parsed_title) and year == parsed_years:
                        return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
def scrape_music(self, title, artist, debrid=False):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title.replace("'", "")))
        query = urlparse.urljoin(self.base_link, query)
        artist_name = clean_title(artist)
        song_name = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        self.musiclist = []
        containers = html.findAll('div', attrs={'class': 'sr-songs-list'})
        for blocks in containers:
            song_block = blocks.findAll('div', attrs={'class': 'item-caption'})
            for item in song_block:
                href = item.findAll('a')[0]['href']
                song_title = item.findAll('a')[0]['title']
                href = href.encode('utf-8')
                song_title = song_title.encode('utf-8')
                if clean_title(song_title) == song_name:
                    artist_block = item.findAll('span', attrs={'class': 'singer'})[0]
                    artist = artist_block.findAll('a')[0]['title']
                    artist = artist.encode('utf-8')
                    artist = clean_title(artist)
                    if artist == artist_name:
                        return self.sources(href, "HD")
    except:
        pass
    return []
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        searchquery = self.search_link % (urllib.quote_plus(title), year)
        query = urlparse.urljoin(self.base_link, searchquery)
        cleaned_title = clean_title(title)
        html = requests.get(query, headers=headers).content
        html = BeautifulSoup(html)
        containers = html.findAll('div', attrs={'class': 'short_content'})
        for items in containers:
            href = items.findAll('a')[0]['href']
            title = items.findAll('div', attrs={'class': 'short_header'})[0]
            if year in str(title):
                title = normalize(str(title))
                if title == cleaned_title:
                    return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
def get_url(scraper, title, show_year, year, season, episode, imdb, tvdb, type,
            cache_location, maximum_age, check_url=False, debrid=False):
    cache_enabled = xbmcaddon.Addon('script.module.nanscrapers').getSetting("cache_enabled") == 'true'
    try:
        dbcon = database.connect(cache_location)
        dbcur = dbcon.cursor()
        try:
            dbcur.execute("SELECT * FROM version")
            match = dbcur.fetchone()
        except:
            nanscrapers.clear_cache()
            dbcur.execute("CREATE TABLE version (version TEXT)")
            dbcur.execute("INSERT INTO version Values ('0.5.4')")
            dbcon.commit()
        dbcur.execute(
            "CREATE TABLE IF NOT EXISTS rel_src ("
            "scraper TEXT, "
            "title Text, show_year TEXT, year TEXT, "
            "season TEXT, "
            "episode TEXT, "
            "imdb_id TEXT, "
            "urls TEXT, "
            "added TEXT, "
            "UNIQUE(scraper, title, year, season, episode)"
            ");")
    except:
        pass
    if cache_enabled:
        try:
            sources = []
            dbcur.execute(
                "SELECT * FROM rel_src WHERE scraper = '%s' AND title = '%s' AND show_year= '%s' AND year = '%s' AND season = '%s' AND episode = '%s'"
                % (scraper.name, clean_title(title).upper(), show_year, year, season, episode))
            match = dbcur.fetchone()
            t1 = int(re.sub('[^0-9]', '', str(match[8])))
            t2 = int(datetime.datetime.now().strftime("%Y%m%d%H%M"))
            update = abs(t2 - t1) > maximum_age
            if update == False:
                sources = json.loads(match[7])
                return sources
        except:
            pass
    try:
        sources = []
        if type == "movie":
            sources = scraper.scrape_movie(title, year, imdb, debrid=debrid)
        elif type == "episode":
            sources = scraper.scrape_episode(title, show_year, year, season, episode, imdb, tvdb, debrid=debrid)
        if sources == None:
            sources = []
        else:
            if cache_enabled:
                try:
                    dbcur.execute(
                        "DELETE FROM rel_src WHERE scraper = '%s' AND title = '%s' AND show_year= '%s' AND year = '%s' AND season = '%s' AND episode = '%s'"
                        % (scraper.name, clean_title(title).upper(), show_year, year, season, episode))
                    dbcur.execute(
                        "INSERT INTO rel_src Values (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                        (scraper.name, clean_title(title).upper(), show_year, year, season,
                         episode, imdb, json.dumps(sources),
                         datetime.datetime.now().strftime("%Y-%m-%d %H:%M")))
                    dbcon.commit()
                except:
                    pass
        if check_url:
            noresolver = False
            try:
                import resolveurl as urlresolver
            except:
                try:
                    import urlresolver as urlresolver
                except:
                    noresolver = True
            new_sources = []
            from common import check_playable
            for source in sources:
                if source["direct"]:
                    check = check_playable(source["url"])
                    if check:
                        new_sources.append(source)
                elif not noresolver:
                    try:
                        hmf = urlresolver.HostedMediaFile(url=source['url'],
                                                          include_disabled=False,
                                                          include_universal=False)
                        if hmf.valid_url():
                            resolved_url = hmf.resolve()
                            check = check_playable(resolved_url)
                            if check:
                                new_sources.append(source)
                    except:
                        pass
                else:
                    new_sources.append(source)
            sources = new_sources
        return sources
    except:
        pass
    return []
def test():
    global movies, shows
    try:
        test_movies = []
        test_episodes = []
        profile_path = xbmc.translatePath(xbmcaddon.Addon().getAddonInfo('profile')).decode('utf-8')
        test_file = xbmcvfs.File(os.path.join(profile_path, "testings.xml"))
        xml = BeautifulStoneSoup(test_file.read())
        test_file.close()
        items = xml.findAll("item")
        for item in items:
            try:
                content = item.find("content")
                if content:
                    if "movie" in content.text:
                        meta = item.find("meta")
                        test_movies.append({
                            'title': meta.find("title").text,
                            'imdb': meta.find("imdb").text,
                            'year': meta.find("year").text,
                        })
                    elif "episode" in content.text:
                        meta = item.find("meta")
                        test_episodes.append({
                            'title': meta.find("tvshowtitle").text,
                            'show_year': int(meta.find("premiered").text[0:4]),
                            'year': meta.find("year").text,
                            'season': meta.find("season").text,
                            # was meta.find("season") twice; episodes were
                            # being tested with the season number
                            'episode': meta.find("episode").text,
                            'imdb': meta.find("imdb").text,
                        })
            except:
                pass
        movies = test_movies
        shows = test_episodes
    except:
        pass
    dialog = xbmcgui.Dialog()
    pDialog = xbmcgui.DialogProgress()
    if dialog.yesno("NaNscrapers Testing Mode", 'Clear cache?'):
        nanscrapers.clear_cache()
    try:
        dbcon = database.connect(os.path.join(
            xbmc.translatePath(xbmcaddon.Addon("script.module.nanscrapers").getAddonInfo('profile')).decode('utf-8'),
            'url_cache.db'))
        dbcur = dbcon.cursor()
    except:
        dialog.ok("NaNscrapers Testing Mode", 'Error connecting to db')
        sys.exit()
    num_movies = len(movies)
    if num_movies > 0:
        pDialog.create('NaNscrapers Testing mode active', 'please wait')
        index = 0
        for movie in movies:
            index += 1
            title = movie['title']
            year = movie['year']
            imdb = movie['imdb']
            if pDialog.iscanceled():
                pDialog.close()
                break
            # float cast so Python 2 integer division doesn't pin progress at 0
            pDialog.update(int(float(index) / num_movies * 100),
                           "Scraping movie {} of {}".format(index, num_movies), title)
            links_scraper = nanscrapers.scrape_movie(title, year, imdb)
            links_scraper = links_scraper()
            for scraper_links in links_scraper:
                if pDialog.iscanceled():
                    break
                if scraper_links:
                    random.shuffle(scraper_links)
        pDialog.close()
        dbcur.execute("SELECT COUNT(DISTINCT(scraper)) FROM rel_src where episode = ''")
        match = dbcur.fetchone()
        num_movie_scrapers = match[0]
        dbcur.execute("SELECT scraper, count(distinct(urls)) FROM rel_src where episode = '' group by scraper")
        matches = dbcur.fetchall()
        failed = []
        for match in matches:
            if int(match[1]) <= 1:
                failed.append(match[0])
        if len(failed) > 0:
            failedstring = "Failed: {}".format(len(failed))
            for fail in failed:
                failedstring += "\n - {}".format(str(fail))
        else:
            failedstring = ""
        dbcur.execute("SELECT title, count(distinct(urls)) FROM rel_src where episode = '' group by title")
        matches = dbcur.fetchall()
        failed_movies = []
        for match in matches:
            if int(match[1]) <= 1:
                if int(match[1]) == 1:
                    dbcur.execute(
                        "SELECT scraper, urls FROM rel_src where episode == '' and title == '{}' group by scraper".format(match[0]))
                    new_matches = dbcur.fetchall()
                    found = False
                    for new_match in new_matches:
                        if new_match[1] == "[]":
                            continue
                        else:
                            found = True
                    if not found:
                        failed_movies.append(match[0])
                else:
                    failed_movies.append(match[0])
        if len(failed_movies) > 0:
            failed_movie_string = "Failed movies: {}".format(len(failed_movies))
            for fail in failed_movies:
                for movie in movies:
                    if clean_title(movie['title']).upper() == str(fail):
                        failed_movie_string += "\n - {}".format(movie["title"])
        else:
            failed_movie_string = ""
    num_shows = len(shows)
    if num_shows > 0:
        pDialog.create('NaNscrapers Testing mode active', 'please wait')
        index = 0
        for show in shows:
            index += 1
            title = show['title']
            show_year = show['show_year']
            year = show['year']
            season = show['season']
            episode = show['episode']
            imdb = show['imdb']
            tvdb = show.get('tvdb', '')
            if pDialog.iscanceled():
                pDialog.close()
                break
            pDialog.update(int(float(index) / num_shows * 100),
                           "Scraping show {} of {}".format(index, num_shows), title)
            links_scraper = nanscrapers.scrape_episode(title, show_year, year, season, episode, imdb, tvdb)
            links_scraper = links_scraper()
            for scraper_links in links_scraper:
                if pDialog.iscanceled():
                    break
                if scraper_links:
                    random.shuffle(scraper_links)
        pDialog.close()
        dbcur.execute("SELECT COUNT(DISTINCT(scraper)) FROM rel_src where episode != ''")
        match = dbcur.fetchone()
        num_show_scrapers = match[0]
        dbcur.execute("SELECT scraper, count(distinct(urls)) FROM rel_src where episode != '' group by scraper")
        matches = dbcur.fetchall()
        failed = []
        for match in matches:
            if int(match[1]) <= 1:
                if int(match[1]) == 1:
                    dbcur.execute(
                        "SELECT scraper, urls FROM rel_src where episode != '' and scraper == '{}' group by scraper".format(match[0]))
                    match = dbcur.fetchone()
                    if match[1] == "[]":
                        failed.append(match[0])
                else:
                    failed.append(match[0])
        if len(failed) > 0:
            show_scraper_failedstring = "Failed: {}".format(len(failed))
            for fail in failed:
                show_scraper_failedstring += "\n - {}".format(str(fail))
        else:
            show_scraper_failedstring = ""
        dbcur.execute("SELECT title, count(distinct(urls)) FROM rel_src where episode != '' group by title")
        matches = dbcur.fetchall()
        failed_shows = []
        for match in matches:
            if int(match[1]) <= 1:
                if int(match[1]) == 1:
                    dbcur.execute(
                        "SELECT scraper, urls FROM rel_src where episode != '' and title == '{}' group by scraper".format(match[0]))
                    new_matches = dbcur.fetchall()
                    found = False
                    for new_match in new_matches:
                        if new_match[1] == "[]":
                            continue
                        else:
                            found = True
                    if not found:
                        failed_shows.append(match[0])
                else:
                    failed_shows.append(match[0])
        if len(failed_shows) > 0:
            failed_show_string = "Failed shows: {}".format(len(failed_shows))
            for fail in failed_shows:
                for show in shows:
                    if clean_title(show['title']).upper() == str(fail):
                        failed_show_string += "\n - {} S{}-E{}".format(show["title"], show["season"], show["episode"])
        else:
            failed_show_string = ""
    resultstring = 'Results:\n'
    if num_movies > 0:
        resultstring = resultstring + \
            ' Movie Scrapers: {}\n' \
            ' {}\n' \
            ' {}\n'.format(num_movie_scrapers, failedstring, failed_movie_string)
    if num_shows > 0:
        resultstring = resultstring + \
            ' Episode Scrapers: {}\n' \
            ' {}\n' \
            ' {}\n'.format(num_show_scrapers, show_scraper_failedstring, failed_show_string)
    dialog.textviewer("NaNscrapers Testing Mode", resultstring)