def process_results_page(self, html, title, artist, referer):
    sources = []
    result = html.findAll("div", attrs={"id": "mp3list"})
    for item in result:
        li = item.find("li", "mp3list-play")
        if not li:
            continue
        playlink = li.find("a")["href"]
        unselectable_text = li.find("div", "unselectable").contents[0]
        # entries read "Artist - Title"; split once so hyphenated titles survive
        parts = unselectable_text.split("-", 1)
        link_artist = parts[0].strip()
        link_title = parts[1].strip()
        if not clean_title(link_title) == clean_title(title):
            continue
        if not clean_title(artist) == clean_title(link_artist):
            continue
        label = "%s - %s" % (link_artist, link_title)
        sources.append({
            'source': label,
            'quality': 'HD',
            'scraper': self.name,
            'url': playlink,
            'direct': True
        })
    return sources
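# Hedged usage sketch (not part of the original module): how a scraper's
# scrape_music() might feed process_results_page(). The "/search/%s" path is
# an assumption, and the module-level helpers used throughout this file
# (random_agent, requests, BeautifulSoup, urllib, urlparse) are assumed here.
def scrape_music(self, title, artist):
    headers = {'User-Agent': random_agent()}
    query = urlparse.urljoin(self.base_link,
                             "/search/%s" % urllib.quote_plus(title))
    html = BeautifulSoup(
        requests.get(query, headers=headers, timeout=30).content)
    return self.process_results_page(html, title, artist, referer=query)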
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    headers = {'User-Agent': random_agent()}
    q = (title.translate(None, '\/:*?"\'<>|!,')).replace(' ', '-').replace('--', '-').lower()
    query = urlparse.urljoin(self.base_link, self.tv_search_link % q)
    cleaned_title = clean_title(title)
    html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
    links = html.findAll('a', attrs={'class': 'top-h1'})
    show_url = None
    for link in links:
        link_title = link.text
        if cleaned_title == clean_title(link_title):
            show_url = link["href"]
            break
    if show_url:
        html = BeautifulSoup(requests.get(show_url, headers=headers, timeout=30).content)
        link_container = html.findAll("div", attrs={'class': 'bottom'})[-1]
        episode_links = link_container.findAll("a")
        episode_format1 = "S%02dE%02d" % (int(season), int(episode))
        episode_format2 = "S%02d-E%02d" % (int(season), int(episode))
        for episode_link in episode_links:
            button = episode_link.contents[0]
            episode_text = button.text
            if episode_format1 in episode_text or episode_format2 in episode_text:
                episode_url = episode_link["href"]
                return self.sources(episode_url, "SD")
    return []
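# Worked example of the slug built above: Python 2 str.translate(None, chars)
# deletes the listed characters, then spaces become hyphens (title hypothetical).
q = ("Marvel's Agents of S.H.I.E.L.D.".translate(None, '\/:*?"\'<>|!,')) \
    .replace(' ', '-').replace('--', '-').lower()
# q == "marvels-agents-of-s.h.i.e.l.d."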
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = urlparse.urljoin(self.base_link, self.search_link)
        query = query % urllib.quote_plus(title)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'item_movie'})
        for container in containers:
            try:
                links = container.findAll('h2', attrs={'class': 'tit'})[0]
                r = links.findAll('a')
                for link in r:
                    link_title = link['title'].encode('utf-8')
                    href = link['href'].encode('utf-8')
                    if len(link_title) > 0 and len(href) > 0:
                        parsed = re.findall('(.+?) \((\d{4})', link_title)
                        parsed_title = parsed[0][0]
                        parsed_years = parsed[0][1]
                        if cleaned_title.lower() == clean_title(parsed_title).lower() and year == parsed_years:
                            if not "http:" in href:
                                href = "http:" + href
                            return self.sources(replaceHTMLCodes(href))
            except:
                pass
    except:
        pass
    return []
def process_results_page(self, html, title, artist, referer):
    sources = []
    result = html.find("div", "result")
    for item in result.findAll("div", "item"):
        title_block = item.find("div", "title")
        link = title_block.find("a")
        link_href = link["href"]
        spans = link.findAll("span")
        link_artist = spans[0].text
        link_title = replaceHTMLCodes(spans[1].text)
        if not clean_title(link_title) == clean_title(title):
            continue
        if not clean_title(artist) == clean_title(link_artist):
            continue
        # copy the shared module-level headers so the referer isn't left
        # behind on the original dict
        headers2 = dict(headers)
        headers2["referer"] = referer
        html = BS(session.get(link_href, headers=headers2).content)
        tab_content = html.find("div", "tab-content")
        music_links = tab_content.findAll("a", "red-link")
        for music_link in music_links:
            sources.append({
                'source': 'mp3',
                'quality': 'HD',
                'scraper': self.name,
                'url': music_link["href"],
                'direct': True
            })
    return sources
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title.replace("'", " ")))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'ml-item'})
        for result in containers:
            links = result.findAll('a')
            for link in links:
                link_title = str(link['title'])
                href = str(link['href'])
                info = str(link['data-url'])
                if clean_title(link_title) == cleaned_title:
                    html = requests.get(info, headers=headers).content
                    pattern = '<div class="jt-info">%s</div>' % year
                    match = re.findall(pattern, html)
                    if match:
                        return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = "%s+season+%s" % (urllib.quote_plus(title), season)
        query = self.search_link % query
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        checkseason = cleaned_title + "season" + season
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'ml-item'})
        for result in containers:
            links = result.findAll('a')
            for link in links:
                link_title = str(link['title'])
                href = str(link['href'])
                if clean_title(link_title) == checkseason:
                    ep_id = '?episode=%01d' % int(episode)
                    href = href + ep_id
                    return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title) + "+" + str(year))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'inner'})
        for container in containers:
            movie_link = container.findAll('a')[0]
            r_href = movie_link['href']
            r_title = movie_link['title']
            link_year = container.findAll('span', attrs={'class': 'year'})[0].findAll('a')[0].text
            if str(year) == link_year:
                if cleaned_title in clean_title(r_title):
                    # the result page redirects to the actual player page
                    redirect = requests.get(r_href, headers=headers, timeout=30).text
                    r_url = re.findall('<a href="(.*?)" class="btn-watch"', redirect)[0]
                    r_url = r_url.encode('utf-8')
                    return self.sources(replaceHTMLCodes(r_url))
    except:
        pass
    return []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        query = urlparse.urljoin(
            self.base_link,
            self.tvsearch_link % urllib.quote_plus(title.replace('\'', '').rsplit(':', 1)[0]))
        html = proxy.get(query, 'item')
        if 'page=2' in html or 'page%3D2' in html:
            html2 = proxy.get(query + '&page=2', 'item')
            html += html2
        html = BeautifulSoup(html)
        cleaned_title = 'watchputlocker' + clean_title(title)
        years = ['%s' % str(year), '%s' % str(int(year) + 1), '%s' % str(int(year) - 1)]
        items = html.findAll('div', attrs={'class': 'item'})
        show_url = None
        for item in items:
            links = item.findAll('a')
            for link in links:
                href = link['href']
                link_title = link['title']
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                except:
                    pass
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                except:
                    pass
                if cleaned_title == clean_title(link_title) and show_year in link_title:
                    url = re.findall('(?://.+?|)(/.+)', href)[0]
                    show_url = urlparse.urljoin(self.base_link, replaceHTMLCodes(url))
        # bail out if no result matched instead of fetching None
        if not show_url:
            return []
        html = BeautifulSoup(proxy.get(show_url, 'tv_episode_item'))
        season_items = html.findAll('div', attrs={'class': 'show_season'})
        for season_item in season_items:
            if season_item["data-id"] != season:
                continue
            episode_items = season_item.findAll('div', attrs={'class': 'tv_episode_item'})
            for episode_item in episode_items:
                link = episode_item.findAll('a')[-1]
                href = link["href"]
                link_episode = link.contents[0].strip()
                if link_episode != "E%s" % (episode):
                    continue
                # the last tv_num_versions span holds the air date
                link_airdate = link.findAll('span', attrs={'class': 'tv_num_versions'})[-1]
                link_airdate = link_airdate.contents[0]
                if any(candidate_year in link_airdate for candidate_year in years):
                    return self.sources(href)
    except:
        pass
    return []
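# Worked example of the proxy-link unwrapping used above: search results point
# at a redirector whose real target sits in the "u" (or "q") query parameter.
# The URL below is hypothetical.
import urlparse
href = "http://example-proxy/redirect?u=/watch/some-show"
href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
# href == "/watch/some-show"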
def get_url(scraper, title, show_year, year, season, episode, imdb, tvdb, type,
            cache_location, maximum_age):
    cache_enabled = xbmcaddon.Addon('script.module.nanscrapers').getSetting("cache_enabled") == 'true'
    try:
        dbcon = database.connect(cache_location)
        dbcur = dbcon.cursor()
        try:
            dbcur.execute("SELECT * FROM version")
            match = dbcur.fetchone()
        except:
            nanscrapers.clear_cache()
            dbcur.execute("CREATE TABLE version (version TEXT)")
            dbcur.execute("INSERT INTO version Values ('0.5.4')")
            dbcon.commit()
        dbcur.execute(
            "CREATE TABLE IF NOT EXISTS rel_src ("
            "scraper TEXT, "
            "title Text, show_year TEXT, year TEXT, "
            "season TEXT, "
            "episode TEXT, "
            "imdb_id TEXT, "
            "urls TEXT, "
            "added TEXT, "
            "UNIQUE(scraper, title, year, season, episode)"
            ");")
    except:
        pass
    if cache_enabled:
        try:
            sources = []
            dbcur.execute(
                "SELECT * FROM rel_src WHERE scraper = '%s' AND title = '%s' AND show_year= '%s' AND year = '%s' AND season = '%s' AND episode = '%s'"
                % (scraper.name, clean_title(title).upper(), show_year, year, season, episode))
            match = dbcur.fetchone()
            t1 = int(re.sub('[^0-9]', '', str(match[8])))
            t2 = int(datetime.datetime.now().strftime("%Y%m%d%H%M"))
            update = abs(t2 - t1) > maximum_age
            if update == False:
                sources = json.loads(match[7])
                return sources
        except:
            pass
    try:
        sources = []
        if type == "movie":
            sources = scraper.scrape_movie(title, year, imdb)
        elif type == "episode":
            sources = scraper.scrape_episode(title, show_year, year, season, episode, imdb, tvdb)
        if sources == None:
            sources = []
        else:
            if cache_enabled:
                dbcur.execute(
                    "DELETE FROM rel_src WHERE scraper = '%s' AND title = '%s' AND show_year= '%s' AND year = '%s' AND season = '%s' AND episode = '%s'"
                    % (scraper.name, clean_title(title).upper(), show_year, year, season, episode))
                dbcur.execute(
                    "INSERT INTO rel_src Values (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                    (scraper.name, clean_title(title).upper(), show_year, year, season,
                     episode, imdb, json.dumps(sources),
                     datetime.datetime.now().strftime("%Y-%m-%d %H:%M")))
                dbcon.commit()
        return sources
    except:
        pass
    return []
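# A minimal sketch (assumption, not the shipped code): the cache lookup above
# interpolates clean_title(title) straight into the SQL string, which breaks
# on any title that still carries a quote. sqlite3 parameter binding avoids
# that; table and column names match rel_src as created above.
def cached_sources(dbcur, scraper_name, title, show_year, year, season, episode):
    dbcur.execute(
        "SELECT urls, added FROM rel_src WHERE scraper = ? AND title = ? AND "
        "show_year = ? AND year = ? AND season = ? AND episode = ?",
        (scraper_name, clean_title(title).upper(), str(show_year), str(year),
         str(season), str(episode)))
    return dbcur.fetchone()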
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        ep_id = int(episode)
        season_id = int(season)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'inner'})
        for container in containers:
            show_link = container.findAll('a')[0]
            r_href = show_link['href']
            r_title = show_link['title']
            if cleaned_title in clean_title(r_title) and "tv" in r_title.lower():
                redirect = requests.get(r_href, headers=headers, timeout=30).text
                r_url = re.findall('<a href="(.*?)" class="btn-watch"', redirect)[0]
                r_url = r_url.encode('utf-8')
                links = BeautifulSoup(requests.get(r_url, headers=headers, timeout=30).content)
                ep_items = links.findAll('ul', attrs={'class': 'episodelist'})
                for items in ep_items:
                    ep_links = items.findAll('a')
                    for r in ep_links:
                        ep_url = r['href'].encode('utf-8')
                        ep_title = r['title'].encode('utf-8')
                        clean_ep_title = clean_title(ep_title)
                        # the site labels episodes inconsistently, so try the
                        # common SxxEyy spellings in turn
                        if ("s%02de%02d" % (season_id, ep_id) in clean_ep_title
                                or "s%02d%02d" % (season_id, ep_id) in clean_ep_title
                                or "s%02d%d" % (season_id, ep_id) in clean_ep_title
                                or "epse%d%d" % (season_id, ep_id) in clean_ep_title):
                            return self.sources(replaceHTMLCodes(ep_url))
    except:
        pass
    return []
# NB: the "muscic" typo is kept because callers import the function by this name.
def get_muscic_url(scraper, title, artist, cache_location, maximum_age, debrid=False):
    cache_enabled = xbmcaddon.Addon('script.module.nanscrapers').getSetting("cache_enabled") == 'true'
    try:
        dbcon = database.connect(cache_location)
        dbcur = dbcon.cursor()
        try:
            dbcur.execute("SELECT * FROM version")
            match = dbcur.fetchone()
        except:
            nanscrapers.clear_cache()
            dbcur.execute("CREATE TABLE version (version TEXT)")
            dbcur.execute("INSERT INTO version Values ('0.5.4')")
            dbcon.commit()
        dbcur.execute(
            "CREATE TABLE IF NOT EXISTS rel_music_src ("
            "scraper TEXT, "
            "title Text, "
            "artist TEXT, "
            "urls TEXT, "
            "added TEXT, "
            "UNIQUE(scraper, title, artist)"
            ");")
    except:
        pass
    if cache_enabled:
        try:
            sources = []
            dbcur.execute(
                "SELECT * FROM rel_music_src WHERE scraper = '%s' AND title = '%s' AND artist = '%s'"
                % (scraper.name, clean_title(title).upper(), artist.upper()))
            match = dbcur.fetchone()
            t1 = int(re.sub('[^0-9]', '', str(match[4])))
            t2 = int(datetime.datetime.now().strftime("%Y%m%d%H%M"))
            update = abs(t2 - t1) > maximum_age
            if update == False:
                sources = json.loads(match[3])
                return sources
        except:
            pass
    try:
        sources = scraper.scrape_music(title, artist, debrid=debrid)
        if sources == None:
            sources = []
        else:
            if cache_enabled:
                # was artist.upper (missing call parentheses), which
                # interpolated the bound-method repr into the SQL
                dbcur.execute(
                    "DELETE FROM rel_music_src WHERE scraper = '%s' AND title = '%s' AND artist = '%s'"
                    % (scraper.name, clean_title(title).upper(), artist.upper()))
                dbcur.execute(
                    "INSERT INTO rel_music_src Values (?, ?, ?, ?, ?)",
                    (scraper.name, clean_title(title).upper(), artist.upper(),
                     json.dumps(sources),
                     datetime.datetime.now().strftime("%Y-%m-%d %H:%M")))
                dbcon.commit()
        return sources
    except:
        pass
    return []
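# Sketch of a stricter freshness check (assumption, not the shipped behaviour):
# the t1/t2 comparison above subtracts YYYYMMDDHHMM integers, so crossing a day
# boundary jumps by ~7600 rather than 60 * 24. Parsing the stored "added"
# stamp gives a true age in minutes.
import datetime

def is_stale(added, maximum_age_minutes):
    then = datetime.datetime.strptime(added, "%Y-%m-%d %H:%M")
    age = datetime.datetime.now() - then
    return (age.total_seconds() / 60.0) > maximum_age_minutes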
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        html = BeautifulSoup(self.get_html(title, self.tvsearch_link))
        index_items = html.findAll('div', attrs={'class': re.compile('index_item.+?')})
        title = 'watch' + clean_title(title).replace(": ", "")
        for index_item in index_items:
            try:
                links = index_item.findAll('a')
                for link in links:
                    href = link['href']
                    link_title = link['title']
                    try:
                        href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                    except:
                        pass
                    try:
                        href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                    except:
                        pass
                    if title == clean_title(link_title):
                        # href is the show page relative url
                        show_url = urlparse.urljoin(self.base_link, href)
                        html = BeautifulSoup(proxy.get(show_url, 'tv_episode_item'))
                        seasons = html.findAll('div', attrs={'class': 'show_season'})
                        for scraped_season in seasons:
                            if scraped_season['data-id'] == season:
                                tv_episode_items = scraped_season.findAll('div', attrs={'class': 'tv_episode_item'})
                                for tv_episode_item in tv_episode_items:
                                    links = tv_episode_item.findAll('a')
                                    for link in links:
                                        if link.contents[0].strip() == "E%s" % episode:
                                            episode_href = link['href']
                                            try:
                                                episode_href = urlparse.parse_qs(
                                                    urlparse.urlparse(episode_href).query)['u'][0]
                                            except:
                                                pass
                                            try:
                                                episode_href = urlparse.parse_qs(
                                                    urlparse.urlparse(episode_href).query)['q'][0]
                                            except:
                                                pass
                                            return self.sources(episode_href)
            except:
                continue
    except:
        pass
    return []
def scrape_music(self, title, artist, debrid=False):
    try:
        song_search = clean_title(title.lower()).replace(' ', '+')
        artist_search = clean_title(artist.lower()).replace(' ', '+')
        song_comp = clean_title(title.lower())
        artist_comp = clean_title(artist.lower())
        total = artist_comp + '-' + song_comp
        start_url = '%sresults?search=%s+%s' % (self.base_link, artist_search, song_search)
        html = requests.get(start_url, headers=headers, timeout=20).content
        match = re.compile('<h4 class="card-title">(.+?)</h4>.+?href="(.+?)"', re.DOTALL).findall(html)
        for m, link in match:
            match2 = m.replace('\n', '').replace('\t', '').replace(' ', '')
            match3 = match2.lower()
            # whatever trails the "artist-song" stem is treated as the quality tag
            quals = re.compile(str(total) + '(.+?)>').findall(str(match3) + '>')
            qual1 = str(quals)
            qual = qual1.replace("[", "").replace("]", "")
            if clean_title(title).lower() in clean_title(match2).lower():
                if clean_title(artist).lower() in clean_title(match2).lower():
                    self.sources.append({
                        'source': 'Youtube',
                        'quality': qual,
                        'scraper': self.name,
                        'url': link,
                        'direct': True
                    })
        return self.sources
    except Exception, argument:
        return self.sources
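# Worked example of the quality extraction above (card text hypothetical):
import re
total = "adele-hello"
match3 = "adele-hello320kbps"
quals = re.compile(str(total) + '(.+?)>').findall(str(match3) + '>')
# quals == ['320kbps']; str(quals) with the brackets stripped still carries
# the inner quotes, so the stored quality string ends up as "'320kbps'"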
def scrape_movie(self, title, year, imdb):
    try:
        query = self.moviesearch_link % urllib.quote_plus(title.replace('\'', '').rsplit(':', 1)[0])
        query = urlparse.urljoin(self.base_link, query)
        html = proxy.get(query, 'item')
        if 'page=2' in html or 'page%3D2' in html:
            html2 = proxy.get(query + '&page=2', 'item')
            html += html2
        html = BeautifulSoup(html)
        cleaned_title = 'watchputlocker' + clean_title(title)
        years = ['(%s)' % str(year), '(%s)' % str(int(year) + 1), '(%s)' % str(int(year) - 1)]
        items = html.findAll('div', attrs={'class': 'item'})
        for item in items:
            links = item.findAll('a')
            for link in links:
                href = link['href']
                link_title = link['title']
                if any(candidate_year in link_title for candidate_year in years):
                    try:
                        href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                    except:
                        pass
                    try:
                        href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                    except:
                        pass
                    if cleaned_title == clean_title(link_title):
                        url = re.findall('(?://.+?|)(/.+)', href)[0]
                        url = replaceHTMLCodes(url)
                        return self.sources(url)
    except:
        pass
    return []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = requests.get(query, headers=headers, timeout=30).json()
        results = html['series']
        for item in results:
            r_title = item['label'].encode('utf-8')
            r_link = item['seo'].encode('utf-8')
            if cleaned_title == clean_title(r_title):
                r_page = self.base_link + "/" + r_link
                r_html = BeautifulSoup(requests.get(r_page, headers=headers, timeout=30).content)
                r = r_html.findAll('div', attrs={'class': re.compile('\s*el-item\s*')})
                for container in r:
                    try:
                        r_href = container.findAll('a')[0]['href'].encode('utf-8')
                        r_title = container.findAll('a')[0]['title'].encode('utf-8')
                        episode_check = "[sS]%02d[eE]%02d" % (int(season), int(episode))
                        match = re.search(episode_check, r_title)
                        if match:
                            return self.sources(replaceHTMLCodes(r_href))
                        match2 = re.search(episode_check, r_href)
                        if match2:
                            return self.sources(replaceHTMLCodes(r_href))
                    except:
                        pass
    except:
        pass
    return []
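# Quick check of the SxxEyy pattern built above (values hypothetical):
import re
episode_check = "[sS]%02d[eE]%02d" % (4, 9)   # -> "[sS]04[eE]09"
assert re.search(episode_check, "Show S04E09 HDTV")
assert re.search(episode_check, "/watch/show-s04e09-online")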
def scrape_movie(self, title, year, imdb):
    try:
        html = BeautifulSoup(self.get_html(title, self.moviesearch_link))
        index_items = html.findAll('div', attrs={'class': 'index_item index_item_ie'})
        title = 'watch' + clean_title(title).replace(": ", "").replace("'", "")
        years = ['(%s)' % str(year), '(%s)' % str(int(year) + 1), '(%s)' % str(int(year) - 1)]
        fallback = None
        for index_item in index_items:
            try:
                links = index_item.findAll('a')
                for link in links:
                    href = link['href']
                    link_title = link['title']
                    if any(x in link_title for x in years) or not "(" in link_title:
                        try:
                            href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                        except:
                            pass
                        try:
                            href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                        except:
                            pass
                        if title.lower() == clean_title(link_title):
                            if '(%s)' % str(year) in link_title:
                                return self.sources(href)
                            else:
                                fallback = href
            except:
                continue
        if fallback:
            return self.sources(fallback)
    except:
        pass
    return []
def tvshow(self, url, title, season, episode):
    try:
        self.url = []
        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        year = data['year']
        cleanmovie = clean_title(title)
        data['season'], data['episode'] = season, episode
        seasoncheck = "season%s" % season
        episode = "%01d" % int(episode)
        checktitle = cleanmovie + seasoncheck
        seasonquery = "season+%s" % season
        query = self.search_link % (urllib.quote_plus(title), seasonquery)
        query = urlparse.urljoin(self.base_link, query)
        link = BeautifulSoup(requests.get(query).text)
        r = link.findAll('div', attrs={'class': 'ml-item'})
        for links in r:
            page_links = links.findAll('a')[0]
            pageurl = page_links['href']
            info = page_links['rel']
            title = page_links['title']
            info = info.encode('utf-8')
            title = title.encode('utf-8')
            if checktitle == clean_title(title):
                pageurl = pageurl.encode('utf-8')
                ep_url = pageurl + 'watch/'
                referer = ep_url
                ep_links = BeautifulSoup(requests.get(ep_url).text)
                r_ep = ep_links.findAll('div', attrs={'class': 'les-content'})
                for item in r_ep:
                    # findall needs a string; item.contents is a list of Tags
                    match = re.compile('<a href="(.*?)" class=.*?">Episode\s*(\d+)').findall(str(item))
                    for href, ep_items in match:
                        ep_items = '%01d' % int(ep_items)
                        if ep_items == episode:
                            self.url.append([href, referer])
        self.Sources(self.url)
    except:
        return
def scrape_movie(self, imdb, title, year):
    try:
        self.url = []
        title = getsearch(title)
        cleanmovie = clean_title(title)
        query = self.search_link % (urllib.quote_plus(title), year)
        query = urlparse.urljoin(self.base_link, query)
        link = requests.get(query).text
        html = BeautifulSoup(link)
        r = html.findAll('div', attrs={'class': 'ml-item'})
        for links in r:
            page_links = links.findAll('a')[0]
            pageurl = page_links['href']
            info = page_links['rel']
            title = page_links['title']
            info = info.encode('utf-8')
            title = title.encode('utf-8')
            if cleanmovie in clean_title(title):
                infolink = requests.get(info).text
                match_year = re.search('class="jt-info">(\d{4})<', infolink)
                match_year = match_year.group(1)
                if year in match_year:
                    pageurl = pageurl.encode('utf-8')
                    url = pageurl + 'watch/'
                    referer = url
                    link = BeautifulSoup(requests.get(url).text)
                    r = link.findAll('div', attrs={'class': 'les-content'})
                    for item in r:
                        try:
                            vidlinks = item.findAll('a')[0]['href']
                            vidlinks = vidlinks.encode('utf-8')
                            self.url.append([vidlinks, referer])
                        except:
                            pass
        self.Sources(self.url)
    except:
        return self.url
def scrape_music(self, title, artist, debrid=False):
    try:
        song_search = clean_title(title.lower()).replace(' ', '+')
        artist_search = clean_title(artist.lower()).replace(' ', '+')
        start_url = '%sresults?search_query=%s+%s' % (self.base_link, artist_search, song_search)
        html = requests.get(start_url, headers=headers, timeout=20).content
        match = re.compile('<h4 class="card-title">.+?</i>(.+?)</h4>.+?id="(.+?)"', re.DOTALL).findall(html)
        count = 0
        for m, link in match:
            # strip layout whitespace, then decode the common HTML entities
            match4 = m.replace('\n', '').replace('\t', '').replace('  ', ' ')
            match5 = re.sub('&#(\d+);', '', match4)
            match5 = re.sub('(&#[0-9]+)([^;^0-9]+)', '\\1;\\2', match5)
            match5 = match5.replace('&quot;', '"').replace('&amp;', '&')
            match5 = re.sub('\\\|/|\(|\)|\[|\]|\{|\}|-|:|;|\*|\?|"|\'|<|>|\_|\.|\?', ' ', match5)
            match5 = ' '.join(match5.split())
            match2 = m.replace('\n', '').replace('\t', '').replace(' ', '')
            if clean_title(title).lower() in clean_title(match2).lower():
                if clean_title(artist).lower() in clean_title(match2).lower():
                    final_link = 'https://www.youtube.com/watch?v=' + link
                    count += 1
                    self.sources.append({'source': self.name, 'quality': 'SD',
                                         'scraper': match5, 'url': final_link,
                                         'direct': False})
        if dev_log == 'true':
            end_time = time.time() - self.start_time
            send_log(self.name, end_time, count)
        return self.sources
    except Exception, argument:
        return self.sources
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        q = (title.translate(None, '\/:*?"\'<>|!,')).replace(' ', '-').replace('--', '-').lower()
        query = urlparse.urljoin(self.base_link, self.movie_search_link % q)
        cleaned_title = clean_title(title)
        html = requests.get(query, headers=headers, timeout=30).content
        containers = re.compile('<a class="top-item".*href="(.*?)"><cite>(.*?)</cite></a>').findall(html)
        for href, title in containers:
            parsed = re.findall('(.+?) \((\d{4})', title)
            parsed_title = parsed[0][0]
            parsed_years = parsed[0][1]
            if cleaned_title == clean_title(parsed_title) and year == parsed_years:
                try:
                    headers = {'User-Agent': random_agent()}
                    html = requests.get(href, headers=headers, timeout=30).content
                    parsed_html = BeautifulSoup(html)
                    quality_title = parsed_html.findAll("h3", attrs={'title': re.compile("Quality of ")})[0]
                    quality = quality_title.findAll('span')[0].text
                    match = re.search('href="([^"]+-full-movie-[^"]+)', html)
                    if match:
                        url = match.group(1)
                        return self.sources(url, "SD")
                except:
                    pass
    except:
        pass
    return []
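# Worked example of the '(.+?) \((\d{4})' parse used above (title hypothetical):
import re
parsed = re.findall('(.+?) \((\d{4})', "Blade Runner (1982) HD")
# parsed == [('Blade Runner', '1982')] -> parsed[0][0] is the title and
# parsed[0][1] the year string compared against the requested year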
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'cell_container'})
        for container in containers:
            links = container.findAll('a')
            for link in links:
                link_title = link['title']
                href = link['href']
                if len(link_title) > 0 and len(href) > 0:
                    parsed = re.findall('(.+?) \((\d{4})', link_title)
                    parsed_title = parsed[0][0]
                    parsed_years = parsed[0][1]
                    if cleaned_title == clean_title(parsed_title) and year == parsed_years:
                        return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
def scrape_music(self, title, artist, debrid=False):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title.replace("'", "")))
        query = urlparse.urljoin(self.base_link, query)
        artist_name = clean_title(artist)
        song_name = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        self.musiclist = []
        containers = html.findAll('div', attrs={'class': 'sr-songs-list'})
        for blocks in containers:
            song_block = blocks.findAll('div', attrs={'class': 'item-caption'})
            for item in song_block:
                href = item.findAll('a')[0]['href']
                song_title = item.findAll('a')[0]['title']
                href = href.encode('utf-8')
                song_title = song_title.encode('utf-8')
                if clean_title(song_title) == song_name:
                    artist_block = item.findAll('span', attrs={'class': 'singer'})[0]
                    artist = artist_block.findAll('a')[0]['title']
                    artist = artist.encode('utf-8')
                    artist = clean_title(artist)
                    if artist == artist_name:
                        return self.sources(href, "HD")
    except:
        pass
    return []
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        searchquery = self.search_link % (urllib.quote_plus(title), year)
        query = urlparse.urljoin(self.base_link, searchquery)
        cleaned_title = clean_title(title)
        html = requests.get(query, headers=headers).content
        html = BeautifulSoup(html)
        containers = html.findAll('div', attrs={'class': 'short_content'})
        for items in containers:
            href = items.findAll('a')[0]['href']
            title = items.findAll('div', attrs={'class': 'short_header'})[0]
            if year in str(title):
                title = normalize(str(title))
                if title == cleaned_title:
                    return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
def get_url(scraper, title, show_year, year, season, episode, imdb, tvdb, type,
            cache_location, maximum_age, check_url=False, debrid=False):
    cache_enabled = xbmcaddon.Addon('script.module.nanscrapers').getSetting("cache_enabled") == 'true'
    try:
        dbcon = database.connect(cache_location)
        dbcur = dbcon.cursor()
        try:
            dbcur.execute("SELECT * FROM version")
            match = dbcur.fetchone()
        except:
            nanscrapers.clear_cache()
            dbcur.execute("CREATE TABLE version (version TEXT)")
            dbcur.execute("INSERT INTO version Values ('0.5.4')")
            dbcon.commit()
        dbcur.execute(
            "CREATE TABLE IF NOT EXISTS rel_src ("
            "scraper TEXT, "
            "title Text, show_year TEXT, year TEXT, "
            "season TEXT, "
            "episode TEXT, "
            "imdb_id TEXT, "
            "urls TEXT, "
            "added TEXT, "
            "UNIQUE(scraper, title, year, season, episode)"
            ");")
    except:
        pass
    if cache_enabled:
        try:
            sources = []
            dbcur.execute(
                "SELECT * FROM rel_src WHERE scraper = '%s' AND title = '%s' AND show_year= '%s' AND year = '%s' AND season = '%s' AND episode = '%s'"
                % (scraper.name, clean_title(title).upper(), show_year, year, season, episode))
            match = dbcur.fetchone()
            t1 = int(re.sub('[^0-9]', '', str(match[8])))
            t2 = int(datetime.datetime.now().strftime("%Y%m%d%H%M"))
            update = abs(t2 - t1) > maximum_age
            if update == False:
                sources = json.loads(match[7])
                return sources
        except:
            pass
    try:
        sources = []
        if type == "movie":
            sources = scraper.scrape_movie(title, year, imdb, debrid=debrid)
        elif type == "episode":
            sources = scraper.scrape_episode(title, show_year, year, season, episode, imdb, tvdb, debrid=debrid)
        if sources == None:
            sources = []
        else:
            if cache_enabled:
                try:
                    dbcur.execute(
                        "DELETE FROM rel_src WHERE scraper = '%s' AND title = '%s' AND show_year= '%s' AND year = '%s' AND season = '%s' AND episode = '%s'"
                        % (scraper.name, clean_title(title).upper(), show_year, year, season, episode))
                    dbcur.execute(
                        "INSERT INTO rel_src Values (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                        (scraper.name, clean_title(title).upper(), show_year, year, season,
                         episode, imdb, json.dumps(sources),
                         datetime.datetime.now().strftime("%Y-%m-%d %H:%M")))
                    dbcon.commit()
                except:
                    pass
        if check_url:
            noresolver = False
            try:
                import resolveurl as urlresolver
            except:
                try:
                    import urlresolver as urlresolver
                except:
                    noresolver = True
            new_sources = []
            from common import check_playable
            for source in sources:
                if source["direct"]:
                    check = check_playable(source["url"])
                    if check:
                        new_sources.append(source)
                elif not noresolver:
                    try:
                        hmf = urlresolver.HostedMediaFile(url=source['url'],
                                                          include_disabled=False,
                                                          include_universal=False)
                        if hmf.valid_url():
                            resolved_url = hmf.resolve()
                            check = check_playable(resolved_url)
                            if check:
                                new_sources.append(source)
                    except:
                        pass
                else:
                    new_sources.append(source)
            sources = new_sources
        return sources
    except:
        pass
    return []
def test():
    global movies, shows
    try:
        test_movies = []
        test_episodes = []
        profile_path = xbmc.translatePath(xbmcaddon.Addon().getAddonInfo('profile')).decode('utf-8')
        test_file = xbmcvfs.File(os.path.join(profile_path, "testings.xml"))
        xml = BeautifulStoneSoup(test_file.read())
        test_file.close()
        items = xml.findAll("item")
        for item in items:
            try:
                content = item.find("content")
                if content:
                    if "movie" in content.text:
                        meta = item.find("meta")
                        test_movies.append({
                            'title': meta.find("title").text,
                            'imdb': meta.find("imdb").text,
                            'year': meta.find("year").text,
                        })
                    elif "episode" in content.text:
                        meta = item.find("meta")
                        test_episodes.append({
                            'title': meta.find("tvshowtitle").text,
                            'show_year': int(meta.find("premiered").text[0:4]),
                            'year': meta.find("year").text,
                            'season': meta.find("season").text,
                            # was meta.find("season") twice; episodes were
                            # being tested with the season number
                            'episode': meta.find("episode").text,
                            'imdb': meta.find("imdb").text,
                        })
            except:
                pass
        movies = test_movies
        shows = test_episodes
    except:
        pass
    dialog = xbmcgui.Dialog()
    pDialog = xbmcgui.DialogProgress()
    if dialog.yesno("NaNscrapers Testing Mode", 'Clear cache?'):
        nanscrapers.clear_cache()
    try:
        dbcon = database.connect(os.path.join(
            xbmc.translatePath(xbmcaddon.Addon("script.module.nanscrapers").getAddonInfo('profile')).decode('utf-8'),
            'url_cache.db'))
        dbcur = dbcon.cursor()
    except:
        dialog.ok("NaNscrapers Testing Mode", 'Error connecting to db')
        sys.exit()
    num_movies = len(movies)
    if num_movies > 0:
        pDialog.create('NaNscrapers Testing mode active', 'please wait')
        index = 0
        for movie in movies:
            index += 1
            title = movie['title']
            year = movie['year']
            imdb = movie['imdb']
            if pDialog.iscanceled():
                pDialog.close()
                break
            # float cast so Python 2 integer division doesn't pin progress at 0
            pDialog.update(int(float(index) / num_movies * 100),
                           "Scraping movie {} of {}".format(index, num_movies), title)
            links_scraper = nanscrapers.scrape_movie(title, year, imdb)
            links_scraper = links_scraper()
            for scraper_links in links_scraper:
                if pDialog.iscanceled():
                    break
                if scraper_links:
                    random.shuffle(scraper_links)
        pDialog.close()
        dbcur.execute("SELECT COUNT(DISTINCT(scraper)) FROM rel_src where episode = ''")
        match = dbcur.fetchone()
        num_movie_scrapers = match[0]
        dbcur.execute("SELECT scraper, count(distinct(urls)) FROM rel_src where episode = '' group by scraper")
        matches = dbcur.fetchall()
        failed = []
        for match in matches:
            if int(match[1]) <= 1:
                failed.append(match[0])
        if len(failed) > 0:
            failedstring = "Failed: {}".format(len(failed))
            for fail in failed:
                failedstring += "\n - {}".format(str(fail))
        else:
            failedstring = ""
        dbcur.execute("SELECT title, count(distinct(urls)) FROM rel_src where episode = '' group by title")
        matches = dbcur.fetchall()
        failed_movies = []
        for match in matches:
            if int(match[1]) <= 1:
                if int(match[1]) == 1:
                    dbcur.execute(
                        "SELECT scraper, urls FROM rel_src where episode == '' and title == '{}' group by scraper".format(match[0]))
                    new_matches = dbcur.fetchall()
                    found = False
                    for new_match in new_matches:
                        if new_match[1] == "[]":
                            continue
                        else:
                            found = True
                    if not found:
                        failed_movies.append(match[0])
                else:
                    failed_movies.append(match[0])
        if len(failed_movies) > 0:
            failed_movie_string = "Failed movies: {}".format(len(failed_movies))
            for fail in failed_movies:
                for movie in movies:
                    if clean_title(movie['title']).upper() == str(fail):
                        failed_movie_string += "\n - {}".format(movie["title"])
        else:
            failed_movie_string = ""
    num_shows = len(shows)
    if num_shows > 0:
        pDialog.create('NaNscrapers Testing mode active', 'please wait')
        index = 0
        for show in shows:
            index += 1
            title = show['title']
            show_year = show['show_year']
            year = show['year']
            season = show['season']
            episode = show['episode']
            imdb = show['imdb']
            tvdb = show.get('tvdb', '')
            if pDialog.iscanceled():
                pDialog.close()
                break
            pDialog.update(int(float(index) / num_shows * 100),
                           "Scraping show {} of {}".format(index, num_shows), title)
            links_scraper = nanscrapers.scrape_episode(title, show_year, year, season, episode, imdb, tvdb)
            links_scraper = links_scraper()
            for scraper_links in links_scraper:
                if pDialog.iscanceled():
                    break
                if scraper_links:
                    random.shuffle(scraper_links)
        pDialog.close()
        dbcur.execute("SELECT COUNT(DISTINCT(scraper)) FROM rel_src where episode != ''")
        match = dbcur.fetchone()
        num_show_scrapers = match[0]
        dbcur.execute("SELECT scraper, count(distinct(urls)) FROM rel_src where episode != '' group by scraper")
        matches = dbcur.fetchall()
        failed = []
        for match in matches:
            if int(match[1]) <= 1:
                if int(match[1]) == 1:
                    dbcur.execute(
                        "SELECT scraper, urls FROM rel_src where episode != '' and scraper == '{}' group by scraper".format(match[0]))
                    match = dbcur.fetchone()
                    if match[1] == "[]":
                        failed.append(match[0])
                else:
                    failed.append(match[0])
        if len(failed) > 0:
            show_scraper_failedstring = "Failed: {}".format(len(failed))
            for fail in failed:
                show_scraper_failedstring += "\n - {}".format(str(fail))
        else:
            show_scraper_failedstring = ""
        dbcur.execute("SELECT title, count(distinct(urls)) FROM rel_src where episode != '' group by title")
        matches = dbcur.fetchall()
        failed_shows = []
        for match in matches:
            if int(match[1]) <= 1:
                if int(match[1]) == 1:
                    dbcur.execute(
                        "SELECT scraper, urls FROM rel_src where episode != '' and title == '{}' group by scraper".format(match[0]))
                    new_matches = dbcur.fetchall()
                    found = False
                    for new_match in new_matches:
                        if new_match[1] == "[]":
                            continue
                        else:
                            found = True
                    if not found:
                        failed_shows.append(match[0])
                else:
                    failed_shows.append(match[0])
        if len(failed_shows) > 0:
            failed_show_string = "Failed shows: {}".format(len(failed_shows))
            for fail in failed_shows:
                for show in shows:
                    if clean_title(show['title']).upper() == str(fail):
                        failed_show_string += "\n - {} S{}-E{}".format(show["title"], show["season"], show["episode"])
        else:
            failed_show_string = ""
    resultstring = 'Results:\n'
    if num_movies > 0:
        resultstring = resultstring + \
            ' Movie Scrapers: {}\n' \
            ' {}\n' \
            ' {}\n'.format(num_movie_scrapers, failedstring, failed_movie_string)
    if num_shows > 0:
        resultstring = resultstring + \
            ' Episode Scrapers: {}\n' \
            ' {}\n' \
            ' {}\n'.format(num_show_scrapers, show_scraper_failedstring, failed_show_string)
    dialog.textviewer("NaNscrapers Testing Mode", resultstring)