def itermedias(chid, chids=None, adaptive=True):
    if not chids:
        chids = [chid]
    for chid in chids:
        url = domain + chid
        up = parse.urlparse(url)
        chid = up.path.split("/")[-1]
        subpage = htmlement.fromstring(net.http(url, referer=domain))
        embedlink = subpage.find(".//iframe").get("src")
        embedpage = htmlement.fromstring(net.http(embedlink, referer=url))
        script = embedpage.find(".//script[@id='v']")
        jsurl = "%s://%s/embed/%s" % (up.scheme, up.netloc, chid)
        data = {"e": 1, "id": script.get("data-i")}
        scode = net.http(jsurl, referer=embedlink, data=data,
                         headers={"x-requested-with": "XMLHttpRequest"},
                         method="POST")
        url = None
        # The response is a reversed, URL-safe base64 payload: restore the
        # standard alphabet, reverse it, then retry with each padding length.
        scode = scode.replace("-", "+")
        scode = scode.replace("_", "/")
        scode = scode[::-1]
        for suffix in ["", "=", "=="]:
            try:
                url = base64.b64decode(scode + suffix)
                break
            except Exception:
                continue
        if url:
            url = url.decode()
            yield net.hlsurl(url, headers={"referer": domain}, adaptive=adaptive)
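# A minimal standalone sketch of the decode step above, assuming only the
# stdlib; the helper name is hypothetical and not part of the original code.
import base64

def decode_reversed_b64(scode):
    # Restore the standard base64 alphabet, undo the reversal, then try
    # each possible padding length until one decodes cleanly.
    scode = scode.replace("-", "+").replace("_", "/")[::-1]
    for suffix in ("", "=", "=="):
        try:
            return base64.b64decode(scode + suffix).decode()
        except Exception:
            continue
    return None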
def itermedias(ctvcid, ctvcids=None):
    if not ctvcids:
        ctvcids = [ctvcid]
    for ctvcid in ctvcids:
        u = domain + "/" + ctvcid
        iframe1 = htmlement.fromstring(net.http(
            u, referer=domain)).find(".//iframe").get("src")
        iframe2 = htmlement.fromstring(net.http(
            iframe1, referer=u)).find(".//iframe").get("src")
        src = net.http(iframe2, referer=iframe1)
        media = re.search(
            r"file[\s\t]*?\:[\s\t]*?atob\((?:\"|\')(.+?)(?:\"|\')\)", src)
        if media:
            yield net.hlsurl(base64.b64decode(media.group(1)).decode(),
                             headers={"referer": domain})
        else:
            for script in htmlement.fromstring(src).iterfind(".//script"):
                if script.get("src") and "yayin" in script.get("src"):
                    scriptsrc = net.http(script.get("src"), referer=domain)
                    key = re.search(rgxkey, scriptsrc)
                    if key:
                        for link in re.findall(rgxlink, scriptsrc):
                            if "anahtar" in link:
                                link = net.absurl(link, script.get("src"))
                                yield net.hlsurl(link + key.group(1),
                                                 headers={"referer": domain})
                        break
def searchshows(self, keyword=None):
    with Browser(loadtimeout=0) as browser:
        browser.navigate(domain, validate=self.ispagevalid)
        browser.elem_setattr("value", "'%s'" % keyword, tag="input")
        browser.elem_call("submit", tag="form")
        browser.loadtimeout = 3
        page = browser.html()
    self.scrapegrid(htmlement.fromstring(page))
    if not len(self.items):
        # Some searches answer with a javascript redirect to a detail page
        # instead of a results grid.
        redirect = re.search(
            r"window\.location\s*?\=\s*?(?:\"|\')(.+?)(?:\"|\')", page)
        if redirect and "anime/" in redirect.group(1):
            url = net.absurl(redirect.group(1), domain)
            with Browser(loadtimeout=0) as browser:
                page = browser.navigate(url, domain, validate=self.ispagevalid)
            xpage = htmlement.fromstring(page)
            div = xpage.find(".//div[@class='table-responsive']/")
            title = div.find(".//tr[2]/td[3]").text
            img = net.absurl(
                div.find(".//div[@class='imaj']/.//img").get("data-src"),
                domain)
            imgid = re.search("([0-9]+)", img).group(1)
            art = {"icon": img, "thumb": img, "poster": img}
            url = imgid, art
            self.additem(title, url, art=art)
def iteratechannels():
    entrypage = net.http(girisurl, cache=10)
    url = htmlement.fromstring(entrypage).findall(
        ".//div[@class='sites']/.//a")[0].get("href")
    xpage = htmlement.fromstring(net.http(url, cache=10))
    links = xpage.findall(".//div[@class='channels']/.//div[@id='tab5']/.//a")
    for link in links:
        chname = tools.elementsrc(link.find(".//div[@class='name']"),
                                  exclude=[link.find(".//b")]).strip()
        yield url, link.get("data-url"), chname
def iteratechannels():
    xpage = htmlement.fromstring(net.http(domain))
    for ch in iterpage(xpage):
        yield ch
    pagination = xpage.findall(".//ul[@id='sayfalama']/.//a")
    lastpage = pagination[-1].get("href").split("/")[-1]
    for i in range(2, int(lastpage) + 1):
        xpage = htmlement.fromstring(net.http(domain + "/p/%s" % i))
        for ch in iterpage(xpage):
            yield ch
def getshows(self, catargs=None):
    if catargs:
        with Browser() as browser:
            page = browser.navigate(catargs, referer=domain,
                                    validate=self.ispagevalid)
        self.scrapegrid(htmlement.fromstring(page))
def gettree(self, url):
    if url is None:
        url = ""
    return htmlement.fromstring(
        self.download(self.domain + url, encoding=self.encoding,
                      referer=self.domain))
def extract_youtube(self, source):
    import htmlement
    import urlquick

    # The Element class isn't exposed directly by the C implementation,
    # so the type trick is needed here.
    if isinstance(source, type(htmlement.Etree.Element(None))):
        video_elem = source
    else:
        # Temporary method to extract the video url from an embedded
        # youtube video.
        if source.startswith("http://") or source.startswith("https://"):
            source = urlquick.get(source, max_age=0).text
        try:
            video_elem = htmlement.fromstring(source)
        except RuntimeError:  # pragma: no cover
            return None

    # Search for all types of embedded videos
    video_urls = []
    video_urls.extend(video_elem.findall(".//iframe[@src]"))
    video_urls.extend(video_elem.findall(".//embed[@src]"))
    for url in video_urls:
        match = re.match(VALID_YOUTUBE_URL, url.get("src"))
        if match is not None:  # pragma: no branch
            videoid = match.group(2)
            return u"plugin://plugin.video.youtube/play/?video_id={}".format(videoid)
def list_subcategories(plugin, subcategory, **kwargs):
    """Build subcategory listing"""
    ina_html = urlquick.get(URL_ROOT).text.encode('utf-8')
    ina = htmlement.fromstring(ina_html)
    if subcategory == 'themes':
        ina = ina.find('.//div[@class="menusThemes"]')
        for sub in ina.iterfind('.//a'):
            url = sub.get('href')
            label = sub.text.encode('utf-8')
            if url[-1] != '/' or label == 'Voir tout':
                continue
            item = Listitem()
            item.label = label
            item.set_callback(list_subsubcategories, url=URL_ROOT + url)
            yield item
    elif subcategory == 'dossiers':
        ina = ina.find('.//div[@class="secondary-nav__dossiers"]')
        for sub in ina.iterfind('.//a'):
            url = sub.get('href')
            label = sub.text.encode('utf-8')
            if url.count('/') != 3 or url[-1] != '/' or label == 'Voir tout':
                continue
            item = Listitem()
            item.label = label
            item.set_callback(list_types, url=URL_ROOT + url)
            yield item
def iterprogrammes():
    u = "https://www.ssport.tv/yayin-akisi"
    pagex = htmlement.fromstring(net.http(u))
    prename = predate = None
    for day in pagex.iterfind('.//ul[@id="switcher-day-s-sport-2"]/li'):
        datadate = day.get("data-date")
        if datadate is not None:
            curmatch = re.search(r"([0-9]+)\s(.+?)\s", datadate)
            curd = int(curmatch.group(1))
            curm = trmonmap[curmatch.group(2).lower().strip()]
            for prog in day.iterfind("./ul/li"):
                pdate = prog.find(".//time")
                pname = prog.find(".//h3")
                if pdate is not None and pname is not None:
                    phour, pmin = pdate.get("datetime").split(":")
                    pdate = datetime.datetime(day=curd, month=curm,
                                              year=now.year, hour=int(phour),
                                              minute=int(pmin), tzinfo=trtz)
                    pname = pname.text.strip()
                    # Emit the previous programme once its end time (the
                    # start of the current one) is known.
                    if prename:
                        yield programme(prename, predate, pdate)
                    prename = pname
                    predate = pdate
def extract_youtube(source):  # pragma: no cover
    warnings.warn("This method was only temporary and will be removed in a "
                  "future release.", DeprecationWarning)
    # TODO: Remove this method now that youtube.dl works on kodi for Xbox
    # noinspection PyPackageRequirements
    import htmlement
    import urlquick

    # The Element class isn't exposed directly by the C implementation,
    # so the type trick is needed here.
    if isinstance(source, type(htmlement.Etree.Element(None))):
        video_elem = source
    else:
        # Temporary method to extract the video url from an embedded
        # youtube video.
        if source.startswith("http://") or source.startswith("https://"):
            source = urlquick.get(source, max_age=0).text
        try:
            video_elem = htmlement.fromstring(source)
        except RuntimeError:  # pragma: no cover
            return None

    # Search for all types of embedded videos
    video_urls = []
    video_urls.extend(video_elem.findall(".//iframe[@src]"))
    video_urls.extend(video_elem.findall(".//embed[@src]"))
    for url in video_urls:
        match = re.match(VALID_YOUTUBE_URL, url.get("src"))
        if match is not None:  # pragma: no branch
            videoid = match.group(2)
            return u"plugin://plugin.video.youtube/play/?video_id={}".format(videoid)
def iterprogrammes(channame):
    prename = predate = None
    for i in range(len(suffixes)):
        pagex = htmlement.fromstring(net.http(url % (channame, suffixes[i])))
        curtxt = pagex.find(".//a[%d]/div[@class='day-date']" % (i + 1)).text
        m1 = re.search(r"([0-9]+)\s(.+)", curtxt)
        curd = int(m1.group(1))
        curm = trmonmap[m1.group(2).lower().strip()]
        for li in pagex.iterfind(".//div[@class='container']/div/ul/li"):
            ptime = li.find(".//strong")
            pname = li.find(".//p")
            if ptime is not None and pname is not None:
                phour, pmin = ptime.text.split(":")
                phour = int(phour)
                pmin = int(pmin)
                pname = pname.text.strip()
                if pname == "-":
                    continue
                pdate = datetime.datetime(day=curd, month=curm, year=now.year,
                                          hour=phour, minute=pmin, tzinfo=trtz)
                if prename:
                    yield programme(prename, predate, pdate)
                prename = pname
                predate = pdate
def iterelems(self, xpath, url=None, page=None, tree=None, cat=None):
    if not tree:
        if not page:
            page = self.download(url)
        tree = htmlement.fromstring(page)
    for elem in tree.iterfind(xpath):
        if cat:
            catid = elem.get("data-serie-category-id")
            if catid != cat:
                continue
        img = elem.find(".//img")
        if img is not None:
            img = img.get("src")
        else:
            img = "DefaultFolder.png"
        title = elem.find(".//h3")
        if title is None:
            continue
        title = title.text
        info = {}
        art = {"icon": img, "thumb": img, "poster": img}
        if xpath.endswith("/a"):
            link = elem.get("href")
        else:
            link = elem.find(".//a").get("href")
        yield title, link, info, art
def test_fromstring():
    # Check that I can parse a simple tree
    html = "<html><body></body></html>"
    root = htmlement.fromstring(html)
    assert Etree.iselement(root)
    assert root.tag == "html"
    assert root[0].tag == "body"
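# A companion sketch, assuming only htmlement behavior already exercised in
# this file (fromstring, find, attribute access); this test is illustrative,
# not part of the original suite.
def test_fromstring_attrs():
    html = "<html><body><a href='/x' title='X'>X</a></body></html>"
    root = htmlement.fromstring(html)
    a = root.find(".//a")
    assert a.get("href") == "/x"
    assert a.get("title") == "X"
    assert a.text == "X"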
def resolve(self, url, headers):
    if "vidnext" in url:
        page = net.http(url, headers=headers)
        tree = htmlement.fromstring(page)
        iframe = tree.find(".//iframe[@id='embedvideo_main']")
        if iframe is not None:
            headers["referer"] = url
            url = iframe.get("src")
        else:
            up = parse.urlparse(url)
            jsq = dict(parse.parse_qsl(up.query))
            jsurl = "https://%s/ajax.php" % up.netloc
            js = json.loads(
                net.http(jsurl, params=jsq, referer=url,
                         headers={"x-requested-with": "XMLHttpRequest"}))
            for k in ["source", "source_bk"]:
                for vid in js.get(k, []):
                    yield net.tokodiurl(vid["file"], headers={"referer": url})
    up = parse.urlparse(url)
    if "movcloud.net" in url:
        vid = up.path.split("/")[-1]
        jsurl = "https://api.%s/stream/%s" % (up.netloc, vid)
        js = json.loads(net.http(jsurl, referer=url))
        for vid in sorted(js["data"]["sources"], key=itemgetter("height"),
                          reverse=True):
            yield net.tokodiurl(vid["file"], headers={"referer": url})
    else:
        raise Exception("unknown url:%s" % url)
def list_alpha2(plugin, js_file, mode, range_elt, page=1, **kwargs):
    """Build categories listing after range choice"""
    params_l = [
        'order=asc',
        'page=' + str(page),
        'nbResults=48',
        'mode=' + mode,
        'range=' + range_elt
    ]
    url = URL_ROOT + '/blocs/rubrique_sommaire/' + js_file \
        + '?' + '&'.join(params_l)
    list_categories_text = urlquick.get(url).text.encode('utf-8')
    list_categories_json = json.loads(list_categories_text)
    categories = htmlement.fromstring(list_categories_json["html"])
    cnt = 0
    for category in categories.iterfind(".//div[@class='media']"):
        cnt = cnt + 1
        item = Listitem()
        item.label = category.find('.//img').get('alt')
        item.art['thumb'] = item.art['landscape'] = URL_ROOT + \
            category.find('.//img').get('src')
        url = URL_ROOT + category.find('.//a').get('href')
        item.set_callback(list_types, url=url)
        item_post_treatment(item)
        yield item
    if cnt == 48:
        # More categories...
        yield Listitem.next_page(js_file=js_file, mode=mode,
                                 range_elt=range_elt, page=page + 1)
    elif cnt == 0:
        plugin.notify(plugin.localize(30718), '')
        yield False
def scraperesults(self, page, tree, query=None):
    for row in tree.findall(".//div[@class='nblock']/div/div[2]"):
        a = row.find(".//a")
        if a is None:
            continue
        link = a.get("href")
        name = a.get("title")
        years = row.findall(".//span")
        if len(years) > 1:
            ryear = re.search("([0-9]{4})", years[1].text)
            if ryear:
                year = int(ryear.group(1))
        if len(years) <= 1 or not ryear:
            year = "-1"
        if norm(name) == norm(self.item.title) and \
                (self.item.show or
                 (self.ignoreyear or
                  self.item.year is None or
                  self.item.year == year)):
            self.found = True
            p = self.request(domain + link, referer=domain)
            e = htmlement.fromstring(p)
            self.scrapepage(p, e)
            break
    if query and not self.found:
        pages = tree.findall(".//div[@class='pagin']/a")
        for page in pages:
            if "sonra" in page.text.lower():
                if self.found:
                    break
                query = dict(
                    urlparse.parse_qsl(
                        urlparse.urlparse(page.get("href")).query))
                # Recurse with both the raw page and the parsed tree; the
                # original call passed only the page, which doesn't match
                # this method's signature.
                p = self.request(domain + "/find.php", query, referer=domain)
                self.scraperesults(p, htmlement.fromstring(p), query)
def find(chname):
    xpage = htmlement.fromstring(net.http("%s/tv-kanallari" % domain, cache=60))
    for channel in xpage.iterfind(".//a[@class='channel-card']"):
        div = channel.find(".//div[@class='name']")
        if div is not None and normalize(
                tools.elementsrc(div)) == normalize(chname):
            return channel.get("href")
def getcategories(self):
    tree = htmlement.fromstring(self.download(self.domain + "/programlar"))
    for cat in tree.iterfind(
            ".//ul[@class='category-list category-type']/li"):
        catid = cat.get("data-category-id")
        if catid == "0":
            continue
        self.additem(cat.text, catid)
def getcategories(self):
    u = "%sajax/turler" % domain
    with Browser() as browser:
        page = browser.navigate(
            u, domain, headers={"x-requested-with": "XMLHttpRequest"})
        # page = self.download(u, headers=headers, referer=domain)
    xpage = htmlement.fromstring(page)
    for a in xpage.iterfind(".//a"):
        self.additem(a.get("title"), net.absurl(a.get("href"), domain))
def geturls(self, args):
    url, scrape = args
    if not scrape:
        yield url
    else:
        page = self.download(url, referer=domain)
        tree = htmlement.fromstring(page)
        for u in tree.findall(".//meta[@itemprop='embedUrl']"):
            yield u.get("content")
def find(self, query):
    q = {"cat": "sub", "find": query}
    page = self.request(domain + "/find.php", q, referer=domain)
    tree = htmlement.fromstring(page)
    title = tree.find(".//title")
    # A search results page carries "arama" (Turkish for "search") in its
    # title; otherwise the request landed directly on a detail page.
    if "arama" in title.text.lower():
        self.scraperesults(page, tree, q)
    else:
        self.scrapepage(page, tree)
def get_video_url(plugin, item_id, video_url, download_mode=False, **kwargs):
    """Get video URL and start video player"""
    video_html = urlquick.get(video_url).text
    root = htmlement.fromstring(video_html)
    iframe_src = URL_ROOT + root.find('.//iframe[@id="player-iframe"]').get("src")
    player_html = urlquick.get(iframe_src).text
    video_url = re.search("source: '(.*?)'", player_html).group(1)
    return video_url
async def sukebei_search(query):
    async with aiohttp.ClientSession() as session:
        # Note: if "urlencode" here is urllib.parse.urlencode it expects a
        # mapping, not a bare string; urllib.parse.quote_plus is the usual
        # choice for escaping a single query term.
        async with session.get(f'https://sukebei.nyaa.si/?page=rss&c=0_0&f=0&q={urlencode(query)}') as resp:
            root = htmlement.fromstring(await resp.text())
    results = []
    for i in root.iterfind('.//channel/item'):
        title = i.find('title').text
        # HTML parsers treat <link> as a void element, so the URL ends up
        # in the element's tail rather than its text.
        link = i.find('link').tail
        results.append((title, link))
    return results
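# A hypothetical synchronous caller for the coroutine above; asyncio.run is
# stdlib (Python 3.7+), and the wrapper name is illustrative only.
import asyncio

def sukebei_search_sync(query):
    return asyncio.run(sukebei_search(query))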
def geturls(self, url):
    resolve, url = url
    if not resolve:
        for u in url:
            yield u
    if resolve:
        url = self.domain + url
        tree = htmlement.fromstring(self.download(url))
        video = tree.find(".//video/source")
        if video is not None:
            yield tokodiurl(video.get("src"), headers={"Referer": url})
def getepisodes(self, showargs=None, seaargs=None):
    if showargs:
        aniid, art = showargs
        url = "%sajax/bolumler&animeId=%s" % (domain, aniid)
        with Browser() as browser:
            page = browser.navigate(
                url, domain, headers={"x-requested-with": "XMLHttpRequest"})
        for a in htmlement.fromstring(page).iterfind(".//a"):
            href = a.get("href")
            if href and "/video/" in href:
                title = a.get("title")
                url = net.absurl(a.get("href"), domain)
                self.additem(title, url, art=art)
    else:
        with Browser() as browser:
            self.scrapegrid(
                htmlement.fromstring(
                    browser.navigate(domain, None, self.ispagevalid)))
def _getrels(uname, rname):
    page = htmlement.fromstring(
        _page("https://%s/%s/%s/tags" % (_dom, uname, rname)))
    rels = page.findall(".//div[@class='Box']/div/div/div/ul")
    allrels = []
    for rel in rels:
        links = rel.findall(".//a[@class='muted-link']")
        # The first link of a tag row points at the commit, the second at
        # the zip download.
        commit = links[0].get("href").split("/")[-1]
        zipu = links[1].get("href")
        zipu = "https://" + _dom + zipu
        allrels.append([commit, zipu])
    return allrels
def render_component(self, component_template, context=None):
    # Use None instead of a mutable {} default: the dict is mutated below
    # ("del"), so a shared default would leak state across calls.
    context = context or {}
    initial_data = context.get("initial_data")
    if initial_data:
        del context["initial_data"]
    component_render = render_to_string(component_template, context=context)
    root = htmlement.fromstring(component_render).find("div")
    root.set("wire:id", self.id)
    if initial_data:
        root.set("wire:initial-data", json.dumps(initial_data))
    res = ET.tostring(root)
    return mark_safe(smart_str(res))
def geturls(self, id):
    fansubxpath = ".//div[@class='panel-body']/div[1]/button"
    mirrorxpath = ".//div[@class='panel-body']/div[4]/button"
    with Browser() as browser:
        page = browser.navigate(id, domain, self.ispagevalid)
    xpage = htmlement.fromstring(page)
    fansubs = {}
    for fansub, fansublink in tools.safeiter(
            self.iterajaxlink(xpage, fansubxpath)):
        fansubs[fansub] = fansublink
    if not fansubs:
        for _, mirrorlink in tools.safeiter(
                self.iterajaxlink(xpage, mirrorxpath)):
            mirror = self.getlink(mirrorlink)
            if mirror:
                yield mirror
    else:
        fansubselect = gui.select("Select Fansub", list(fansubs.keys()))
        i = -1
        for _fansub, fansublink in fansubs.items():
            i += 1
            if fansubselect == -1 or fansubselect == i:
                with Browser(None, 0) as browser:
                    page = browser.navigate(
                        fansublink, id,
                        headers={"x-requested-with": "XMLHttpRequest"})
                xfansubpage = htmlement.fromstring(page)
                mirror = self.getlink(None, xfansubpage)
                if mirror:
                    yield mirror
                for _, mirrorlink in tools.safeiter(
                        self.iterajaxlink(xfansubpage, mirrorxpath)):
                    mirror = self.getlink(mirrorlink)
                    if mirror:
                        yield mirror
def cacheepisodes(self, url):
    info = {"plot": "", "plotoutline": ""}
    tree = htmlement.fromstring(self.download(self.domain + url))
    title = tree.find('.//meta[@property="og:title"]')
    plot1 = tree.find(".//div[@class='detail-content']/p")
    plot2 = tree.find(".//h2[@class='detail-description']")
    if plot1 is not None:
        info["plot"] = info["plotoutline"] = plot1.text
    elif plot2 is not None:
        info["plot"] = info["plotoutline"] = plot2.text
    if title is not None:
        info["title"] = title.get("content")
    return info, {}