def sources(self, url):
    sources = []
    try:
        if url is None:
            return sources
        absolute_url = urlparse.urljoin(self.base_link, url)
        html = BeautifulSoup(proxy.get(absolute_url, 'link_ite'))
        # each hoster link sits in a table whose class starts with "link_ite"
        tables = html.findAll('table', attrs={'class': re.compile('link_ite.+?')})
        for table in tables:
            rows = table.findAll('tr')
            for row in rows:
                link = row.findAll('a')[-1]
                href = link['href']
                if 'gtfo' not in href:
                    continue
                # unwrap optional ?u=/?q= redirect wrappers, then base64-decode
                # the real target carried in the ?gtfo= parameter
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                except:
                    pass
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                except:
                    pass
                href = base64.b64decode(urlparse.parse_qs(urlparse.urlparse(href).query)['gtfo'][0])
                href = replaceHTMLCodes(href)
                host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(href.strip().lower()).netloc)[0]
                host = replaceHTMLCodes(host)
                host = host.encode('utf-8')
                quality = row.findAll('div', attrs={'class': 'quality'})[0].text
                if 'CAM' in quality or 'TS' in quality:
                    quality = 'CAM'
                elif 'HD' not in quality:
                    quality = 'SD'
                sources.append({'source': host, 'quality': quality, 'scraper': self.name,
                                'url': href, 'direct': False})
    except:
        pass
    return sources
def sources(self, url):
    sources = []
    try:
        if url is None:
            return sources
        url = urlparse.urljoin(self.base_link, url)
        html = proxy.get(url, 'choose_tabs')
        parsed_html = BeautifulSoup(html)
        table_bodies = parsed_html.findAll('tbody')
        for table_body in table_bodies:
            link = table_body.findAll('a')[0]["href"]
            try:
                link = urlparse.parse_qs(urlparse.urlparse(link).query)['u'][0]  # replace link with ?u= part if present
            except:
                pass
            try:
                link = urlparse.parse_qs(urlparse.urlparse(link).query)['q'][0]  # replace link with ?q= part if present
            except:
                pass
            link = urlparse.parse_qs(urlparse.urlparse(link).query)['url'][0]  # replace link with ?url= part if present
            link = base64.b64decode(link)  # decode base64
            if link.startswith("//"):
                link = "http:" + link
            link = replaceHTMLCodes(link)
            link = link.encode('utf-8')
            host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(link.strip().lower()).netloc)[0]
            host = replaceHTMLCodes(host)
            host = host.encode('utf-8')
            quality = table_body.findAll('span')[0]["class"]
            if quality == 'quality_cam' or quality == 'quality_ts':
                quality = 'CAM'
            elif quality == 'quality_dvd':
                quality = 'SD'
            sources.append({'source': host, 'quality': quality, 'scraper': 'Primewire',
                            'url': link, 'direct': False})
        return sources
    except:
        return sources
def sources(self, url):
    sources = []
    try:
        if url is None:
            return sources
        url = urlparse.urljoin(self.base_link, url)
        html = proxy.get(url, 'choose_tabs')
        parsed_html = BeautifulSoup(html)
        table_bodies = parsed_html.findAll('tbody')
        for table_body in table_bodies:
            link = table_body.findAll('a')[0]["href"]
            try:
                link = urlparse.parse_qs(urlparse.urlparse(link).query)['u'][0]  # replace link with ?u= part if present
            except:
                pass
            try:
                link = urlparse.parse_qs(urlparse.urlparse(link).query)['q'][0]  # replace link with ?q= part if present
            except:
                pass
            link = urlparse.parse_qs(urlparse.urlparse(link).query)['url'][0]  # replace link with ?url= part if present
            link = base64.b64decode(link)  # decode base64
            if link.startswith("//"):
                link = "http:" + link
            link = replaceHTMLCodes(link)
            link = link.encode('utf-8')
            host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(link.strip().lower()).netloc)[0]
            host = replaceHTMLCodes(host)
            host = host.encode('utf-8')
            quality = table_body.findAll('span')[0]["class"]
            if quality == 'quality_cam' or quality == 'quality_ts':
                quality = 'CAM'
            elif quality == 'quality_dvd':
                quality = 'SD'
            if "qertewrt" in host:
                continue
            sources.append({'source': host, 'quality': quality, 'scraper': 'Primewire',
                            'url': link, 'direct': False})
        return sources
    except:
        return sources
def sources(self, url):
    sources = []
    try:
        if url is None:
            return sources
        absolute_url = urlparse.urljoin(self.base_link, url)
        html = BeautifulSoup(proxy.get(absolute_url, 'link_ite'))
        tables = html.findAll('table', attrs={'class': re.compile('link_ite.+?')})
        for table in tables:
            rows = table.findAll('tr')
            for row in rows:
                link = row.findAll('a')[-1]
                href = link['href']
                if 'gtfo' not in href:
                    continue
                # unwrap optional ?u=/?q= redirect wrappers, then base64-decode
                # the real target carried in the ?gtfo= parameter
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                except:
                    pass
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                except:
                    pass
                href = base64.b64decode(urlparse.parse_qs(urlparse.urlparse(href).query)['gtfo'][0])
                href = replaceHTMLCodes(href)
                host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(href.strip().lower()).netloc)[0]
                host = replaceHTMLCodes(host)
                host = host.encode('utf-8')
                if "qertewrt" in host:
                    continue
                quality = row.findAll('div', attrs={'class': 'quality'})[0].text
                if 'CAM' in quality or 'TS' in quality:
                    quality = 'CAM'
                elif 'HD' not in quality:
                    quality = 'SD'
                sources.append({'source': host, 'quality': quality, 'scraper': self.name,
                                'url': href, 'direct': False})
    except:
        pass
    return sources
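# The sources() variants above all follow the same link-unwrapping recipe: peel
# off optional ?u=/?q= redirect wrappers, base64-decode the real target from a
# query parameter (?gtfo= or ?url=), then reduce the hoster URL to a bare
# "domain.tld" name. A minimal standalone sketch of that recipe follows; the
# helper name and the payload_param default are illustrative assumptions, not
# part of the scrapers themselves.
import re
import base64
import urlparse  # Python 2 stdlib, as used throughout these scrapers


def _unwrap_and_host(href, payload_param='url'):
    # peel off ?u= / ?q= redirect wrappers when present
    for wrapper in ('u', 'q'):
        try:
            href = urlparse.parse_qs(urlparse.urlparse(href).query)[wrapper][0]
        except:
            pass
    # the real target is base64-encoded in e.g. ?url= or ?gtfo=
    try:
        href = base64.b64decode(urlparse.parse_qs(urlparse.urlparse(href).query)[payload_param][0])
    except:
        pass
    # reduce the hoster URL to its bare "domain.tld" form
    host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(href.strip().lower()).netloc)
    return href, (host[0] if host else '')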
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = "%s+season+%s" % (urllib.quote_plus(title), season)
        query = self.search_link % query
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        checkseason = cleaned_title + "season" + season
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'ml-item'})
        for result in containers:
            links = result.findAll('a')
            for link in links:
                link_title = str(link['title'])
                href = str(link['href'])
                if clean_title(link_title) == checkseason:
                    # append the requested episode number so sources() can pick it out
                    ep_id = '?episode=%01d' % int(episode)
                    href = href + ep_id
                    return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title.replace("'", " ")))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'ml-item'})
        for result in containers:
            links = result.findAll('a')
            for link in links:
                link_title = str(link['title'])
                href = str(link['href'])
                info = str(link['data-url'])
                if clean_title(link_title) == cleaned_title:
                    # confirm the release year on the detail page before accepting the match
                    html = requests.get(info, headers=headers).content
                    pattern = '<div class="jt-info">%s</div>' % year
                    match = re.findall(pattern, html)
                    if match:
                        return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title) + "+" + str(year))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'inner'})
        for container in containers:
            movie_link = container.findAll('a')[0]
            r_href = movie_link['href']
            r_title = movie_link['title']
            link_year = container.findAll('span', attrs={'class': 'year'})[0].findAll('a')[0].text
            if str(year) == link_year:
                if cleaned_title in clean_title(r_title):
                    # follow the result page and pull the "watch" button target
                    redirect = requests.get(r_href, headers=headers, timeout=30).text
                    r_url = re.findall('<a href="(.*?)" class="btn-watch"', redirect)[0]
                    r_url = r_url.encode('utf-8')
                    return self.sources(replaceHTMLCodes(r_url))
    except:
        pass
    return []
def process_results_page(self, html, title, artist, referer):
    sources = []
    result = html.find("div", "result")
    for item in result.findAll("div", "item"):
        title_block = item.find("div", "title")
        link = title_block.find("a")
        link_href = link["href"]
        spans = link.findAll("span")
        link_artist = spans[0].text
        link_title = replaceHTMLCodes(spans[1].text)
        if clean_title(link_title) != clean_title(title):
            continue
        if clean_title(artist) != clean_title(link_artist):
            continue
        # copy the shared headers so the module-level dict is not mutated,
        # then request the detail page with the search page as referer
        headers2 = dict(headers)
        headers2["referer"] = referer
        html = BS(session.get(link_href, headers=headers2).content)
        tab_content = html.find("div", "tab-content")
        music_links = tab_content.findAll("a", "red-link")
        for music_link in music_links:
            sources.append({'source': 'mp3',
                            'quality': 'HD',
                            'scraper': self.name,
                            'url': music_link["href"],
                            'direct': True})
    return sources
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    # build the slug-style episode URL directly, e.g. /show-title/1-sezon-2-bolum.html
    url_title = title.replace(' ', '-').replace('.', '-').replace(":", "").replace("!", "").replace("?", "").lower()
    episode_url = '/%s/%01d-sezon-%01d-bolum.html' % (url_title, int(season), int(episode))
    return self.sources(replaceHTMLCodes(episode_url))
def sources(self, url):
    sources = []
    try:
        if url is None:
            return sources
        headers = {'User-Agent': random_agent()}
        html = BeautifulSoup(requests.get(url, headers=headers, timeout=30).content)
        r = html.findAll('div', attrs={'class': 'site'})
        for container in r:
            # the real hoster URL is carried in the data-actuallink attribute
            r_url = container.findAll('a')[0]['data-actuallink'].encode('utf-8')
            host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(r_url.strip().lower()).netloc)[0]
            host = replaceHTMLCodes(host)
            host = host.encode('utf-8')
            sources.append({'source': host, 'quality': 'SD', 'scraper': self.name,
                            'url': r_url, 'direct': False})
    except:
        pass
    return sources
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = urlparse.urljoin(self.base_link, self.search_link)
        query = query % urllib.quote_plus(title)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'item_movie'})
        for container in containers:
            try:
                links = container.findAll('h2', attrs={'class': 'tit'})[0]
                r = links.findAll('a')
                for link in r:
                    link_title = link['title'].encode('utf-8')
                    href = link['href'].encode('utf-8')
                    if len(link_title) > 0 and len(href) > 0:
                        # results are titled "Name (YYYY)"; split the title and the year
                        parsed = re.findall('(.+?) \((\d{4})', link_title)
                        parsed_title = parsed[0][0]
                        parsed_years = parsed[0][1]
                        if cleaned_title.lower() == clean_title(parsed_title).lower() and year == parsed_years:
                            if "http:" not in href:
                                href = "http:" + href
                            return self.sources(replaceHTMLCodes(href))
            except:
                pass
    except:
        pass
    return []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = requests.get(query, headers=headers, timeout=30).json()
        results = html['series']
        for item in results:
            r_title = item['label'].encode('utf-8')
            r_link = item['seo'].encode('utf-8')
            if cleaned_title == clean_title(r_title):
                r_page = self.base_link + "/" + r_link
                r_html = BeautifulSoup(requests.get(r_page, headers=headers, timeout=30).content)
                r = r_html.findAll('div', attrs={'class': re.compile('\s*el-item\s*')})
                for container in r:
                    try:
                        r_href = container.findAll('a')[0]['href'].encode('utf-8')
                        r_title = container.findAll('a')[0]['title'].encode('utf-8')
                        # accept the link if "SxxEyy" appears in either its title or its URL
                        episode_check = "[sS]%02d[eE]%02d" % (int(season), int(episode))
                        if re.search(episode_check, r_title):
                            return self.sources(replaceHTMLCodes(r_href))
                        if re.search(episode_check, r_href):
                            return self.sources(replaceHTMLCodes(r_href))
                    except:
                        pass
    except:
        pass
    return []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        query = urlparse.urljoin(self.base_link,
                                 self.tvsearch_link % urllib.quote_plus(title.replace('\'', '').rsplit(':', 1)[0]))
        html = proxy.get(query, 'item')
        # pull in the second page of results if one is advertised
        if 'page=2' in html or 'page%3D2' in html:
            html2 = proxy.get(query + '&page=2', 'item')
            html += html2
        html = BeautifulSoup(html)
        cleaned_title = 'watchputlocker' + clean_title(title)
        years = ['%s' % str(year), '%s' % str(int(year) + 1), '%s' % str(int(year) - 1)]
        items = html.findAll('div', attrs={'class': 'item'})
        show_url = None
        for item in items:
            links = item.findAll('a')
            for link in links:
                href = link['href']
                link_title = link['title']
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                except:
                    pass
                try:
                    href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                except:
                    pass
                if cleaned_title == clean_title(link_title) and show_year in link_title:
                    url = re.findall('(?://.+?|)(/.+)', href)[0]
                    show_url = urlparse.urljoin(self.base_link, replaceHTMLCodes(url))
                else:
                    continue
        html = BeautifulSoup(proxy.get(show_url, 'tv_episode_item'))
        season_items = html.findAll('div', attrs={'class': 'show_season'})
        for season_item in season_items:
            if season_item["data-id"] != season:
                continue
            episode_items = season_item.findAll('div', attrs={'class': 'tv_episode_item'})
            for episode_item in episode_items:
                link = episode_item.findAll('a')[-1]
                href = link["href"]
                link_episode = link.contents[0].strip()
                if link_episode != "E%s" % (episode):
                    continue
                # the last tv_num_versions span carries the air date; accept the
                # episode if the requested year (or an adjacent year) appears in it
                link_airdate = link.findAll('span', attrs={'class': 'tv_num_versions'})[-1]
                link_airdate = link_airdate.contents[0]
                if any(candidate_year in link_airdate for candidate_year in years):
                    return self.sources(href)
    except:
        pass
    return []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        ep_id = int(episode)
        season_id = int(season)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'inner'})
        for container in containers:
            show_link = container.findAll('a')[0]
            r_href = show_link['href']
            r_title = show_link['title']
            if cleaned_title in clean_title(r_title) and "tv" in r_title.lower():
                # follow the result to its watch page, then walk the episode list
                redirect = requests.get(r_href, headers=headers, timeout=30).text
                r_url = re.findall('<a href="(.*?)" class="btn-watch"', redirect)[0]
                r_url = r_url.encode('utf-8')
                links = BeautifulSoup(requests.get(r_url, headers=headers, timeout=30).content)
                ep_items = links.findAll('ul', attrs={'class': 'episodelist'})
                for items in ep_items:
                    ep_links = items.findAll('a')
                    for r in ep_links:
                        ep_url = r['href'].encode('utf-8')
                        ep_title = r['title'].encode('utf-8')
                        clean_ep_title = clean_title(ep_title)
                        # episode titles vary, so try several SxxEyy-style patterns
                        if ("s%02de%02d" % (season_id, ep_id) in clean_ep_title
                                or "s%02d%02d" % (season_id, ep_id) in clean_ep_title
                                or "s%02d%d" % (season_id, ep_id) in clean_ep_title
                                or "epse%d%d" % (season_id, ep_id) in clean_ep_title):
                            return self.sources(replaceHTMLCodes(ep_url))
    except:
        pass
    return []
def scrape_movie(self, title, year, imdb):
    try:
        query = self.moviesearch_link % urllib.quote_plus(title.replace('\'', '').rsplit(':', 1)[0])
        query = urlparse.urljoin(self.base_link, query)
        html = proxy.get(query, 'item')
        # pull in the second page of results if one is advertised
        if 'page=2' in html or 'page%3D2' in html:
            html2 = proxy.get(query + '&page=2', 'item')
            html += html2
        html = BeautifulSoup(html)
        cleaned_title = 'watchputlocker' + clean_title(title)
        years = ['(%s)' % str(year), '(%s)' % str(int(year) + 1), '(%s)' % str(int(year) - 1)]
        items = html.findAll('div', attrs={'class': 'item'})
        for item in items:
            links = item.findAll('a')
            for link in links:
                href = link['href']
                link_title = link['title']
                if any(candidate_year in link_title for candidate_year in years):
                    # unwrap optional ?u=/?q= redirect wrappers before matching
                    try:
                        href = urlparse.parse_qs(urlparse.urlparse(href).query)['u'][0]
                    except:
                        pass
                    try:
                        href = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
                    except:
                        pass
                    if cleaned_title == clean_title(link_title):
                        url = re.findall('(?://.+?|)(/.+)', href)[0]
                        url = replaceHTMLCodes(url)
                        return self.sources(url)
    except:
        pass
    return []
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        searchquery = self.search_link % (urllib.quote_plus(title), year)
        query = urlparse.urljoin(self.base_link, searchquery)
        cleaned_title = clean_title(title)
        html = requests.get(query, headers=headers).content
        html = BeautifulSoup(html)
        containers = html.findAll('div', attrs={'class': 'short_content'})
        for items in containers:
            href = items.findAll('a')[0]['href']
            result_title = items.findAll('div', attrs={'class': 'short_header'})[0]
            # only accept results whose header mentions the requested year
            if year in str(result_title):
                result_title = normalize(str(result_title))
                if result_title == cleaned_title:
                    return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
def scrape_movie(self, title, year, imdb):
    try:
        headers = {'User-Agent': random_agent()}
        query = self.search_link % (urllib.quote_plus(title))
        query = urlparse.urljoin(self.base_link, query)
        cleaned_title = clean_title(title)
        html = BeautifulSoup(requests.get(query, headers=headers, timeout=30).content)
        containers = html.findAll('div', attrs={'class': 'cell_container'})
        for container in containers:
            links = container.findAll('a')
            for link in links:
                link_title = link['title']
                href = link['href']
                if len(link_title) > 0 and len(href) > 0:
                    # results are titled "Name (YYYY)"; split the title and the year
                    parsed = re.findall('(.+?) \((\d{4})', link_title)
                    parsed_title = parsed[0][0]
                    parsed_years = parsed[0][1]
                    if cleaned_title == clean_title(parsed_title) and year == parsed_years:
                        return self.sources(replaceHTMLCodes(href))
    except:
        pass
    return []
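# The "Name (YYYY)"-style movie scrapers above split the result title with the
# same regex; a quick illustration on a made-up link title:
#
#   >>> re.findall('(.+?) \((\d{4})', 'Movie Name (2016) HD')
#   [('Movie Name', '2016')]
#
# parsed[0][0] is the bare title handed to clean_title(); parsed[0][1] is the
# year compared against the requested one.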
def sources(self, url):
    sources = []
    try:
        if url is None:
            return sources
        referer = url
        headers = {'User-Agent': random_agent()}
        url = url.replace('/watching.html', '')
        html = requests.get(url, headers=headers).content
        # split off the ?episode= marker appended by scrape_episode(), if any
        try:
            url, episode = re.findall('(.+?)\?episode=(\d*)$', url)[0]
        except:
            episode = None
        vid_id = re.findall('-(\d+)', url)[-1]
        quality = re.findall('<span class="quality">(.*?)</span>', html)
        quality = quality[0].lower() if quality else ''
        if quality == 'cam' or quality == 'ts':
            quality = 'CAM'
        elif quality == 'hd':
            quality = '720'
        else:
            quality = '480'
        try:
            headers = {'X-Requested-With': 'XMLHttpRequest',
                       'Referer': referer,
                       'User-Agent': random_agent()}
            u = urlparse.urljoin(self.base_link, self.server_link % vid_id)
            r = BeautifulSoup(requests.get(u, headers=headers).content)
            containers = r.findAll('div', attrs={'class': 'les-content'})
            for result in containers:
                links = result.findAll('a')
                for link in links:
                    title = str(link['title'])
                    if episode is not None:
                        # only keep the link for the requested episode number
                        title = re.findall('Episode\s+(\d+):', title)[0]
                        title = '%01d' % int(title)
                        if title == episode:
                            episode_id = str(link['episode-id'])
                        else:
                            continue
                    else:
                        episode_id = str(link['episode-id'])
                    key_gen = ''.join(random.choice(string.ascii_lowercase + string.digits) for x in range(16))
                    # ---- fix from Mucky Duck & XUnity Talk: build the hash/cookie pair ----
                    key = '87wwxtp3dqii'
                    key2 = '7bcq9826avrbi6m49vd7shxkn985mhod'
                    cookie = hashlib.md5(episode_id + key).hexdigest() + '=%s' % key_gen
                    a = episode_id + key2
                    b = key_gen
                    i = b[-1]
                    h = b[:-1]
                    b = i + h + i + h + i + h
                    hash_id = uncensored(a, b)
                    # -----------------------------------------------------------------------
                    serverurl = self.base_link + '/ajax/v2_get_sources/' + episode_id + '?hash=' + urllib.quote(hash_id)
                    headers = {'Accept-Language': 'en-US',
                               'Cookie': cookie,
                               'Referer': referer,
                               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
                               'X-Requested-With': 'XMLHttpRequest'}
                    result = requests.get(serverurl, headers=headers).content
                    result = result.replace('\\', '')
                    # collect the google-video file URLs and keep the best quality available
                    url = re.findall('"?file"?\s*:\s*"(.+?)"', result)
                    url = [googletag(i) for i in url]
                    url = [i[0] for i in url if len(i) > 0]
                    u = []
                    try:
                        u += [[i for i in url if i['quality'] == '1080p'][0]]
                    except:
                        pass
                    try:
                        u += [[i for i in url if i['quality'] == '720'][0]]
                    except:
                        pass
                    try:
                        u += [[i for i in url if i['quality'] == '480'][0]]
                    except:
                        pass
                    url = replaceHTMLCodes(u[0]['url'])
                    quality = googletag(url)[0]['quality']
                    sources.append({'source': 'google video', 'quality': quality, 'scraper': self.name,
                                    'url': url, 'direct': True})
        except:
            pass
    except:
        pass
    return sources
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb):
    # build the slug-style episode URL directly, e.g. /show-title/1-sezon/2-bolum
    url_title = title.replace(' ', '-').replace('.', '-').replace(":", "").replace("!", "").replace("?", "").lower()
    episode_url = '/%s/%01d-sezon/%01d-bolum' % (url_title, int(season), int(episode))
    return self.sources(replaceHTMLCodes(episode_url))
def sources(self, url):
    sources = []
    try:
        if url is None:
            return sources
        if self.base_link not in url:
            url = urlparse.urljoin(self.base_link, url)
        # an appended ?episode=N marks this URL as an episode page
        content = re.compile('(.+?)\?episode=\d*$').findall(url)
        video_type = 'movie' if len(content) == 0 else 'episode'
        try:
            url, episode = re.compile('(.+?)\?episode=(\d*)$').findall(url)[0]
        except:
            pass
        headers = {'User-Agent': random_agent()}
        html = self.scraper.get(url, headers=headers, timeout=30).content
        # the page may arrive gzip-compressed; fall back to the raw body if not
        try:
            compressedstream = StringIO.StringIO(html)
            html = gzip.GzipFile(fileobj=compressedstream).read()
            html = BeautifulSoup(html)
        except:
            html = BeautifulSoup(html)
        links = html.findAll('a', attrs={'target': 'EZWebPlayer'})
        for link in links:
            href = replaceHTMLCodes(link['href'])
            if "get.php" not in href:
                continue
            if video_type == 'episode':
                # keep only the link whose trailing number matches the wanted episode
                link_episode_number = re.compile('(\d+)').findall(link.string)
                if len(link_episode_number) > 0:
                    link_episode_number = link_episode_number[-1]
                    if not link_episode_number == '%01d' % int(episode):
                        continue
            referer = url
            headers = {'User-Agent': random_agent(), 'Referer': referer}
            html = self.scraper.get(href, headers=headers, timeout=30).content
            # pull the JW-player style sources array and its file/label pairs
            source = re.findall('sources\s*:\s*\[(.+?)\]', html)[0]
            files = re.findall('"file"\s*:\s*"(.+?)".+?"label"\s*:\s*"(.+?)"', source)
            if files:
                quality_url_pairs = [{'url': file[0], 'quality': file[1][:-1]} for file in files]
            else:
                files = re.findall('"file"\s*:\s*"(.+?)".+?}', source)
                quality_url_pairs = [{'url': file, 'quality': "SD"} for file in files]
            for pair in quality_url_pairs:
                sources.append({'source': 'google video', 'quality': pair['quality'],
                                'scraper': self.name, 'url': pair['url'], 'direct': True})
    except:
        pass
    return sources
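# Taken together, these methods form the scraper interface used above:
# scrape_movie()/scrape_episode() locate the right page and hand it to
# sources(), which returns a list of dicts of the form
#   {'source': host, 'quality': '...', 'scraper': name, 'url': link, 'direct': bool}
# Below is a minimal sketch of how calling code might drive one scraper and
# rank its results; the function name and the quality ordering are illustrative
# assumptions, not part of this excerpt.
QUALITY_ORDER = {'1080p': 0, '720': 1, '480': 2, 'HD': 3, 'SD': 4, 'CAM': 5}


def collect_movie_sources(scraper, title, year, imdb):
    links = scraper.scrape_movie(title, year, imdb)  # [] when nothing matched
    # sort best quality first; unknown quality labels sort last
    return sorted(links, key=lambda link: QUALITY_ORDER.get(link['quality'], len(QUALITY_ORDER)))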