def check(self, instance): config_url = instance.get('url') added_tags = instance.get('tags') if config_url is None: raise Exception("An url must be specified") # Load basic authentication configuration, if available. username, password = instance.get('username'), instance.get('password') if username and password: auth = (username, password) else: auth = None # Support URLs that have a path in them from the config, for # backwards-compatibility. parsed = urlparse.urlparse(config_url) if parsed.path != "": config_url = "%s://%s" % (parsed.scheme, parsed.netloc) # Tag by URL so we can differentiate the metrics from multiple instances tags = ['url:%s' % config_url] if added_tags is not None: for tag in added_tags: tags.append(tag) # Load stats data. url = urlparse.urljoin(config_url, STATS_URL) stats_data = self._get_data(url, auth) self._process_stats_data(config_url, stats_data, auth, tags=tags) # Load the health data. url = urlparse.urljoin(config_url, HEALTH_URL) health_data = self._get_data(url, auth) self._process_health_data(config_url, health_data, tags=tags)
def search_by_build_id(hex_encoded_id): """ Given a hex-encoded Build ID, return the path to an ELF with that Build ID only the local system. If it can't be found, return None. """ cache = cache_dir + hex_encoded_id if os.path.exists(cache) and read(cache).startswith('\x7FELF'): log.info_once("Using cached data from %r" % cache) return cache log.info("Downloading data from GitHub") url_base = "https://gitlab.com/libcdb/libcdb/raw/master/hashes/build_id/" url = urlparse.urljoin(url_base, hex_encoded_id) data = "" while not data.startswith('\x7fELF'): data = wget(url) if not data: return None if data.startswith('..'): url = os.path.dirname(url) + '/' url = urlparse.urljoin(url, data) write(cache, data) return cache
def ogone_form_generate_values(self, cr, uid, id, values, context=None): base_url = self.pool['ir.config_parameter'].get_param(cr, uid, 'web.base.url') acquirer = self.browse(cr, uid, id, context=context) ogone_tx_values = dict(values) temp_ogone_tx_values = { 'PSPID': acquirer.ogone_pspid, 'ORDERID': values['reference'], 'AMOUNT': float_repr(float_round(values['amount'], 2) * 100, 0), 'CURRENCY': values['currency'] and values['currency'].name or '', 'LANGUAGE': values.get('partner_lang'), 'CN': values.get('partner_name'), 'EMAIL': values.get('partner_email'), 'OWNERZIP': values.get('partner_zip'), 'OWNERADDRESS': values.get('partner_address'), 'OWNERTOWN': values.get('partner_city'), 'OWNERCTY': values.get('partner_country') and values.get('partner_country').code or '', 'OWNERTELNO': values.get('partner_phone'), 'ACCEPTURL': '%s' % urlparse.urljoin(base_url, OgoneController._accept_url), 'DECLINEURL': '%s' % urlparse.urljoin(base_url, OgoneController._decline_url), 'EXCEPTIONURL': '%s' % urlparse.urljoin(base_url, OgoneController._exception_url), 'CANCELURL': '%s' % urlparse.urljoin(base_url, OgoneController._cancel_url), 'PARAMPLUS': 'return_url=%s' % ogone_tx_values.pop('return_url') if ogone_tx_values.get('return_url') else False, } if values.get('type') == 'form_save': temp_ogone_tx_values.update({ 'ALIAS': 'ODOO-NEW-ALIAS-%s' % time.time(), # something unique, 'ALIASUSAGE': values.get('alias_usage') or acquirer.ogone_alias_usage, }) shasign = self._ogone_generate_shasign(acquirer, 'in', temp_ogone_tx_values) temp_ogone_tx_values['SHASIGN'] = shasign ogone_tx_values.update(temp_ogone_tx_values) return ogone_tx_values
def get_sub(self): """Fetches the subtitles from addic7ed from url specified in given database (db) for that episode""" url_split = urlparse.urlsplit (self.url) head, tail = url_split.path.rsplit ('/', 1) new_path = head, 'addic7ed' referer = urlparse.urlunsplit(url_split._replace(path=urlparse.urljoin(*new_path))) domain = self.url response = urllib2.urlopen(domain)#Opens the url html = response.read ()#loads the html code soup = BeautifulSoup (html)#interprets (parse?) the html code links = [] for x in soup.find_all (class_ ="buttonDownload"): links.append (x.attrs['href']) domain = 'http://www.addic7ed.com/' urls = [] for link in links: urls.append (urlparse.urljoin (domain, link)) page = urls[0] req = urllib2.Request(page, headers ={'User-Agent' : 'Mozilla 5.10', 'Referer' : referer}) response = urllib2.urlopen (req) data = response.read() test = response.info() print test if response.info().has_key('Content-Disposition'): with open(os.path.join(self.db.env.subs_dir ,'%s.srt' % self.title), 'wb') as f: f.write(data) else: return response.info()
def sources(self, url, hostDict, hostprDict): try: sources = [] if url == None: return sources url = urlparse.urljoin(self.base_link, url) for i in range(3): result = client.request(url, timeout=10) if not result == None: break dom = dom_parser.parse_dom(result, 'div', attrs={'class':'links', 'id': 'noSubs'}) result = dom[0].content links = re.compile('<tr\s*>\s*<td><i\s+class="fa fa-youtube link-logo"></i>([^<]+).*?href="([^"]+)"\s+class="watch',re.DOTALL).findall(result) for link in links[:5]: try: url2 = urlparse.urljoin(self.base_link, link[1]) for i in range(2): result2 = client.request(url2, timeout=3) if not result2 == None: break r = re.compile('href="([^"]+)"\s+class="action-btn').findall(result2)[0] valid, hoster = source_utils.is_host_valid(r, hostDict) if not valid: continue urls, host, direct = source_utils.check_directstreams(r, hoster) for x in urls: sources.append({'source': host, 'quality': x['quality'], 'language': 'en', 'url': x['url'], 'direct': direct, 'debridonly': False}) except: #traceback.print_exc() pass return sources except: return sources
def novedades(item): logger.info("pelisalacarta.channels.animeflv novedades") # Descarga la pagina data = scrapertools.cache_page(item.url) # Extrae las entradas (carpetas) ''' <div class="not"> <a href="/ver/cyclops-shoujo-saipu-12.html" title="Cyclops Shoujo Saipu 12"> <img class="imglstsr lazy" src="http://cdn.animeflv.net/img/mini/957.jpg" border="0"> <span class="tit_ep"><span class="tit">Cyclops Shoujo Saipu 12</span></span> </a> ''' patronvideos = '<div class="not"[^<]+<a href="([^"]+)" title="([^"]+)"[^<]+<img class="[^"]+" src="([^"]+)"[^<]+' \ '<span class="tit_ep"><span class="tit">([^<]+)<' matches = re.compile(patronvideos, re.DOTALL).findall(data) itemlist = [] for match in matches: scrapedtitle = scrapertools.entityunescape(match[3]) fulltitle = scrapedtitle # directory = match[1] scrapedurl = urlparse.urljoin(item.url, match[0]) scrapedthumbnail = urlparse.urljoin(item.url, match[2].replace("mini", "portada")) scrapedplot = "" if DEBUG: logger.info("title=["+scrapedtitle+"], url=["+scrapedurl+"], thumbnail=["+scrapedthumbnail+"]") itemlist.append(Item(channel=__channel__, action="findvideos", title=scrapedtitle, url=scrapedurl, thumbnail=scrapedthumbnail, plot=scrapedplot, fulltitle=fulltitle, viewmode="movie")) return itemlist
def handle_captcha(self, response, solver): sel = scrapy.Selector(response) iframe_src = sel.xpath(self.CAPTCHA_XPATH).extract()[0] iframe_url = urljoin(response.url, iframe_src) iframe_request = scrapy.Request(iframe_url) iframe_response = yield download(self.crawler, iframe_request) iframe_sel = scrapy.Selector(iframe_response) img_src, = iframe_sel.xpath('//img/@src').extract()[:1] or [None] if img_src is None: raise DecaptchaError('No //img/@src found on CAPTCHA page') img_url = urljoin(iframe_response.url, img_src) img_request = scrapy.Request(img_url) img_response = yield download(self.crawler, img_request) scrapy.log.msg('CAPTCHA image downloaded, solving') captcha_text = yield solver.solve(img_response.body) scrapy.log.msg('CAPTCHA solved: %s' % captcha_text) challenge_request = scrapy.FormRequest.from_response( iframe_response, formxpath='//form', formdata={'recaptcha_response_field': captcha_text} ) challenge_response = yield download(self.crawler, challenge_request) challenge_sel = scrapy.Selector(challenge_response) challenge, = challenge_sel.xpath( '//textarea/text()' ).extract()[:1] or [None] if not challenge: raise DecaptchaError('Bad challenge from reCAPTCHA API:\n%s' % challenge_response.body) scrapy.log.msg('CAPTCHA solved, submitting challenge') submit_request = scrapy.FormRequest.from_response( response, formxpath='//form[.%s]' % self.CAPTCHA_XPATH, formdata={'recaptcha_challenge_field': challenge} ) yield download(self.crawler, submit_request)
def parse(self, response): hxs = HtmlXPathSelector(response) for div in hxs.select('//div[@id="contem_boxes"]'): titulo = div.select('.//div[@id="contem_titulo"]/text()').extract()[0] if not titulo.endswith(u'mara dos Deputados/BR'): continue else: reg = re.compile('<a class="listapar" href="(?P<url>.*?)">(?P<name>[\w\s]*[\w]+)\s*\(<b>[\w\s]+</b>\)\s-\s(?P<party>.*?)\/(?P<state>.*?)</a><br>', flags=re.U) for r in reg.finditer(div.extract()): dict_deputy = r.groupdict() #if dict_deputy['state'] in settings['STATE_TO_FILTER']: db_deputy = self.api.get_deputado_por_nome(dict_deputy['name']) if not db_deputy: dep = Deputado(dict_deputy['name'], dict_deputy['state'], dict_deputy['party']) self.api.inserir_deputado(dep) else: dep = db_deputy[0] id = urlparse.parse_qs(urlparse.urlparse(dict_deputy['url']).query).get('id', [0])[0] if not id: continue request = Request(urljoin(self.base_url, '@presencas.php?id=%s' % id), callback=self.parse_deputy_assiduity) request.meta['dep'] = dep yield request request = Request(urljoin(self.base_url, '@uso_verbas_als.php?uf=16&id=%s' % id), callback=self.parse_deputy_costs) request.meta['dep'] = dep yield request
def getL10nRepositories(changesets, l10nRepoPath, relbranch=None): """Parses a list of locale names and revisions for their associated repository from the 'changesets' string passed in.""" # urljoin() will strip the last part of l10nRepoPath it doesn't end with # "/" if not l10nRepoPath.endswith('/'): l10nRepoPath = l10nRepoPath + '/' repositories = {} try: for locale, data in json.loads(changesets).iteritems(): locale = urljoin(l10nRepoPath, locale) repositories[locale] = { 'revision': data['revision'], 'relbranchOverride': relbranch, 'bumpFiles': [] } except (TypeError, ValueError): for locale, revision in parsePlainL10nChangesets(changesets).iteritems(): if revision == 'FIXME': raise Exception('Found FIXME in changesets for locale "%s"' % locale) locale = urljoin(l10nRepoPath, locale) repositories[locale] = { 'revision': revision, 'relbranchOverride': relbranch, 'bumpFiles': [] } return repositories
def peliscat(item): logger.info("[cinegratis.py] peliscat") url = item.url itemlist = [] itemlist.append( Item(channel=CHANNELNAME, action="listsimple" , title="Versión original" , url="http://www.cinegratis.net/index.php?module=search&title=subtitulado")) itemlist.append( Item(channel=CHANNELNAME, action="listsimple" , title="Versión latina" , url="http://www.cinegratis.net/index.php?module=search&title=latino")) # Descarga la página data = scrapertools.cachePage(url) # Extrae los items patronvideos = "<td align='left'><a href='([^']+)'><img src='([^']+)' border='0'></a></td>" matches = re.compile(patronvideos,re.DOTALL).findall(data) scrapertools.printMatches(matches) for match in matches: # Atributos patron2 = "genero/([A-Za-z\-]+)/" matches2 = re.compile(patron2,re.DOTALL).findall(match[0]) scrapertools.printMatches(matches2) scrapedtitle = matches2[0] scrapedurl = urlparse.urljoin(url,match[0]) scrapedthumbnail = urlparse.urljoin(url,match[1]) scrapedplot = "" if (DEBUG): logger.info("title=["+scrapedtitle+"], url=["+scrapedurl+"], thumbnail=["+scrapedthumbnail+"]") itemlist.append( Item(channel=CHANNELNAME, action="listvideos" , title=scrapedtitle , url=scrapedurl, thumbnail=scrapedthumbnail, plot=scrapedplot)) return itemlist
def novedades(item): logger.info("[serieonline.py] novedades") # Descarga la página data = scrapertools.cachePage(item.url) # Extrae las entradas patronvideos = '<a href="([^"]+)" title="([^"]+)"><img src="([^"]+)" alt="([^"]+)" class="captify" /></a>' matches = re.compile(patronvideos,re.DOTALL).findall(data) if DEBUG: scrapertools.printMatches(matches) itemlist = [] for match in matches: scrapedtitle = match[1] + " " + match[3] scrapedplot = "" scrapedurl = urlparse.urljoin(item.url,match[0]) scrapedthumbnail = urlparse.urljoin(item.url,match[2]) if (DEBUG): logger.info("title=["+scrapedtitle+"], url=["+scrapedurl+"], thumbnail=["+scrapedthumbnail+"]") # Añade al listado de XBMC itemlist.append( Item(channel=CHANNELNAME, action="findvideos", title=scrapedtitle , url=scrapedurl , thumbnail=scrapedthumbnail , plot=scrapedplot , folder=True) ) # Extrae el paginador patronvideos = '<div class="paginacion-num"><a href="([^"]+)">' matches = re.compile(patronvideos,re.DOTALL).findall(data) scrapertools.printMatches(matches) if len(matches)>0: scrapedtitle = "Página siguiente" scrapedurl = urlparse.urljoin(item.url,matches[0]) itemlist.append( Item(channel=CHANNELNAME, action="novedades", title=scrapedtitle , url=scrapedurl , folder=True) ) return itemlist
def search(item,texto): logger.info("[pelisalacarta.seriesblanco search texto="+texto) itemlist = [] item.url = urlparse.urljoin(host,"/search.php?q1=%s" % (texto)) data = scrapertools.cache_page(item.url) data = re.sub(r"\n|\r|\t|\s{2}| |<Br>|<BR>|<br>|<br/>|<br />|-\s","",data) data = re.sub(r"<!--.*?-->","",data) #<div style='float:left;width: 620px;'><div style='float:left;width: 33%;text-align:center;'><a href='/serie/20/against-the-wall.html' '><img class='ict' src='http://4.bp.blogspot.com/-LBERI18Cq-g/UTendDO7iNI/AAAAAAAAPrk/QGqjmfdDreQ/s320/Against_the_Wall_Seriesdanko.jpg' alt='Capitulos de: Against The Wall' height='184' width='120'></a><br><div style='text-align:center;line-height:20px;height:20px;'><a href='/serie/20/against-the-wall.html' style='font-size: 11px;'> Against The Wall</a></div><br><br> patron = "<img class='ict' src='([^']+)'.*?<div style='text-align:center;line-height:20px;height:20px;'><a href='([^']+)' style='font-size: 11px;'>([^<]+)</a>" matches = re.compile(patron,re.DOTALL).findall(data) for scrapedthumbnail, scrapedurl, scrapedtitle in matches: itemlist.append( Item(channel=__channel__, title =scrapedtitle , url=urlparse.urljoin(host,scrapedurl), action="episodios", thumbnail=scrapedthumbnail, fanart ="http://portfolio.vernier.se/files/2014/03/light-grey-wood-photography-hd-wallpaper-1920x1200-46471.jpg", show=scrapedtitle) ) try: return itemlist # Se captura la excepción, para no interrumpir al buscador global si un canal falla except: import sys for line in sys.exc_info(): logger.error( "%s" % line ) return []
def processJob(jobDetails): try: job = {} url = urljoin(rootUrl, jobDetails.a['href']) soup = thisInstitution.getSoup(url) subLinks = soup.select('.pinkbox_heading a') if subLinks: for link in subLinks: job['url'] = urljoin(rootUrl, link['href']) job['title'] = link.get_text() print job['title'] job["language"] = 'de' jobPage = thisInstitution.getSoup(job['url']) content = jobPage.find(id='contentblock') job['text'] = unicode(content) thisInstitution.addRecord(job) else: job['url'] = url job['title'] = jobDetails.a.get_text() print job['title'] job["language"] = 'de' content = soup.find(id='contentblock') job['text'] = unicode(content) thisInstitution.addRecord(job) except Exception as e: print e # record the error with the shared code and continue on to the next url thisInstitution.error(e.message, job) return False
def get_sources(self, video): source_url = self.get_url(video) hosters = [] if source_url and source_url != FORCE_NO_MATCH: url = urlparse.urljoin(self.base_url, source_url) page_html = self._http_get(url, cache_limit=.5) movie_id = dom_parser.parse_dom(page_html, 'div', {'id': 'media-player'}, 'movie-id') if movie_id: server_url = SL_URL % (movie_id[0]) url = urlparse.urljoin(self.base_url, server_url) html = self._http_get(url, cache_limit=.5) sources = {} for match in re.finditer('changeServer\(\s*(\d+)\s*,\s*(\d+)\s*\).*?class="btn-eps[^>]*>([^<]+)', html, re.DOTALL): link_type, link_id, q_str = match.groups() if link_type in ['12', '13', '14']: url = urlparse.urljoin(self.base_url, PLAYLIST_URL1 % (link_id)) sources.update(self.__get_link_from_json(url, q_str)) else: media_url = self.__get_ep_pl_url(link_type, page_html) if media_url: url = urlparse.urljoin(self.base_url, media_url) xml = self._http_get(url, cache_limit=.5) sources.update(self.__get_links_from_xml(xml, video)) for source in sources: if sources[source]['direct']: host = self._get_direct_hostname(source) else: host = urlparse.urlparse(source).hostname hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': sources[source]['quality'], 'views': None, 'rating': None, 'url': source, 'direct': sources[source]['direct']} hosters.append(hoster) return hosters
def __init__(self, layer, mapfile, fonts=None): """ Initialize Mapnik provider with layer and mapfile. XML mapfile keyword arg comes from TileStache config, and is an absolute path by the time it gets here. """ maphref = urljoin(layer.config.dirpath, mapfile) scheme, h, path, q, p, f = urlparse(maphref) if scheme in ('file', ''): self.mapfile = path else: self.mapfile = maphref self.layer = layer self.mapnik = None engine = mapnik.FontEngine.instance() if fonts: fontshref = urljoin(layer.config.dirpath, fonts) scheme, h, path, q, p, f = urlparse(fontshref) if scheme not in ('file', ''): raise Exception('Fonts from "%s" can\'t be used by Mapnik' % fontshref) for font in glob(path.rstrip('/') + '/*.ttf'): engine.register_font(str(font))
def get_favicon_url(url): if not url.startswith('http'): url = "http://{0}".format(url) # Check if the root location has a favicon before parsing for it if _has_root_favicon(url): return urlparse.urljoin(url, 'favicon.ico') headers = {'User-Agent': 'Mozilla/5.0'} request = urllib2.Request(url, None, headers) website = urllib2.urlopen(request).read() soup = BeautifulSoup(website) favicon_element = soup.find("link", rel="shortcut icon") if favicon_element: hostname = urlparse.urlparse(url).hostname favicon_url = favicon_element['href'] if favicon_url.startswith('//cdn'): return "http:" + favicon_url # favicon url is relative and must be converted to absolute path elif hostname not in favicon_url: return urlparse.urljoin(url, favicon_url) else: return favicon_url else: return None
def as_obi_serialization(self, request=None): """Produce an Open Badge Infrastructure serialization of this badge""" if request: base_url = request.build_absolute_uri('/') else: base_url = 'http://%s' % (Site.objects.get_current().domain,) # see: https://github.com/brianlovesdata/openbadges/wiki/Assertions if not self.creator: issuer = SITE_ISSUER else: issuer = { # TODO: Get from user profile instead? "origin": urljoin(base_url, self.creator.get_absolute_url()), "name": self.creator.username, "contact": self.creator.email } data = { # The version of the spec/hub this manifest is compatible with. Use # "0.5.0" for the beta. "version": OBI_VERSION, # TODO: truncate more intelligently "name": self.title[:128], # TODO: truncate more intelligently "description": self.description[:128], "criteria": urljoin(base_url, self.get_absolute_url()), "issuer": issuer } if self.image: data['image'] = urljoin(base_url, self.image.url) return data
def get_sources(self, video): source_url = self.get_url(video) hosters = [] if source_url and source_url != FORCE_NO_MATCH: url = urlparse.urljoin(self.base_url, source_url) html = self._http_get(url, cache_limit=.5) fragment = dom_parser.parse_dom(html, 'div', {'class': '[^"]*movie_langs_list[^"]*'}) if fragment: for match in re.finditer('href="([^"]+)', fragment[0]): match = re.search('movie-player/(.*)', match.group(1)) if match: player_url = urlparse.urljoin(self.base_url, PLAYER_URL % (match.group(1))) html = self._http_get(player_url, cache_limit=.5) match = re.search('<source\s+src="([^"]+)', html) if match: stream_url = match.group(1) hoster = {'multi-part': False, 'url': stream_url, 'class': self, 'quality': self._gv_get_quality(stream_url), 'host': self._get_direct_hostname(stream_url), 'rating': None, 'views': None, 'direct': True} hosters.append(hoster) fragment2 = dom_parser.parse_dom(html, 'ul', {'class': 'servers'}) if fragment2: for match in re.finditer('href="([^"]+).*?<span>(.*?)</span>', fragment2[0]): other_url, quality = match.groups() match = re.search('movie-player/(.*)', other_url) if match: other_url = urlparse.urljoin(self.base_url, PLAYER_URL % (match.group(1))) if other_url == player_url: continue hoster = {'multi-part': False, 'url': other_url, 'class': self, 'quality': QUALITY_MAP.get(quality, QUALITIES.HD720), 'host': self._get_direct_hostname(other_url), 'rating': None, 'views': None, 'direct': True} hosters.append(hoster) return hosters
def search(self, video_type, title, year): results = [] norm_title = self._normalize_title(title) if video_type == VIDEO_TYPES.TVSHOW: for server_url in TVSHOW_URLS: for row in self.__parse_directory(self._http_get(server_url, cache_limit=48)): match_year = '' if norm_title in self._normalize_title(row['title']) and (not year or not match_year or year == match_year): result = {'url': urlparse.urljoin(server_url, row['link']), 'title': row['title'], 'year': match_year} results.append(result) else: search_url = urlparse.urljoin(self.base_url, '/?s=') search_url += urllib.quote_plus(title) html = self._http_get(search_url, cache_limit=1) for article in dom_parser.parse_dom(html, 'article', {'class': 'entry-body'}): link = dom_parser.parse_dom(article, 'a', {'class': 'more-link'}, 'href') content = dom_parser.parse_dom(article, 'div', {'class': 'post-content'}) match = re.search('</a>\s*([^<]+)', content[0]) if content else '' info = dom_parser.parse_dom(article, 'div', {'class': 'post-info'}) is_movie = re.search('/category/movies/', info[0]) if info else False if match and link and is_movie: match_title_year = match.group(1) match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year) if match: match_title, match_year = match.groups() else: match_title = match_title_year match_year = '' if not year or not match_year or year == match_year: result = {'url': self._pathify_url(link[0]), 'title': match_title, 'year': match_year} results.append(result) return results
def findVideoFrameLink(page, data): minheight = 300 minwidth = 300 frames = findFrames(data) if not frames: return None iframes = regexUtils.findall( data, "(frame(?![^>]*cbox\.ws)(?![^>]*Publi)(?![^>]*chat\d*\.\w+)(?![^>]*ad122m)(?![^>]*adshell)(?![^>]*capacanal)(?![^>]*blacktvlive\.com)[^>]*\sheight\s*=\s*[\"']*([\%\d]+)(?:px)?[\"']*[^>]*>)", ) if iframes: for iframe in iframes: if iframe[1] == "100%": height = minheight + 1 else: height = int(iframe[1]) if height > minheight: m = regexUtils.findall(iframe[0], "[\"' ]width\s*=\s*[\"']*(\d+[%]*)(?:px)?[\"']*") if m: if m[0] == "100%": width = minwidth + 1 else: width = int(m[0]) if width > minwidth: m = regexUtils.findall(iframe[0], "['\"\s]src=[\"']*\s*([^>\"' ]+)\s*[>\"']*") if m: return urlparse.urljoin(urllib.unquote(page), m[0]).strip() # Alternative 1 iframes = regexUtils.findall( data, '(frame(?![^>]*cbox\.ws)(?![^>]*capacanal)(?![^>]*blacktvlive\.com)[^>]*["; ]height:\s*(\d+)[^>]*>)' ) if iframes: for iframe in iframes: height = int(iframe[1]) if height > minheight: m = regexUtils.findall(iframe[0], '["; ]width:\s*(\d+)') if m: width = int(m[0]) if width > minwidth: m = regexUtils.findall(iframe[0], '["; ]src=["\']*\s*([^>"\' ]+)\s*[>"\']*') if m: return urlparse.urljoin(urllib.unquote(page), m[0]).strip() # Alternative 2 (Frameset) m = regexUtils.findall(data, '<FRAMESET[^>]+100%[^>]+>\s*<FRAME[^>]+src="([^"]+)"') if m: return urlparse.urljoin(urllib.unquote(page), m[0]).strip() m = regexUtils.findall( data, '<a href="([^"]+)" target="_blank"><img src="[^"]+" height="450" width="600" longdesc="[^"]+"/></a>' ) if m: return urlparse.urljoin(urllib.unquote(page), m[0]).strip() return None
def parse_list(self, response): hxs = HtmlXPathSelector(response) for href in hxs.select(r'//ul[@id="paper-listing"]//a/@href').extract(): yield Request(urlparse.urljoin(response.url, href), callback=self.parse_paper) next = hxs.select(r'//div[@class="pagination"]/ul/li[@class="next"]/a/@href') if len(next): yield Request(urlparse.urljoin(response.url, next[0].extract()), callback=self.parse_list)
def get_sources(self, url, hosthdDict, hostDict, locDict): try: sources = [] if url == None: return sources url = urlparse.urljoin(self.base_link, url) result = client.source(url) video_id = re.compile('video_id *= *[\'|\"](.+?)[\'|\"]').findall(result)[0] post = urllib.urlencode({'video_id': video_id}) result = client.source(urlparse.urljoin(self.base_link, self.info_link), post=post) u = [i for i in result.split('&') if 'google' in i][0] u = urllib.unquote_plus(u) u = [urllib.unquote_plus(i.split('|')[-1]) for i in u.split(',')] u = [googleplus.tag(i)[0] for i in u] u = [i for i in u if i['quality'] in ['1080p', 'HD']] for i in u: sources.append({'source': 'GVideo', 'quality': i['quality'], 'provider': 'Afdah', 'url': i['url']}) return sources except: return sources
def parse(self, response): self._logger.info("start response in parse -> response type:%s"%type(response).__name__) item_urls = [ urljoin(response.url, x) for x in list(set( response.xpath('//div[@id="resultsCol"]//div[@class="a-row a-spacing-none"]/a[@class="a-link-normal a-text-normal"]/@href').extract() )) ] self.crawler.stats.inc_total_pages(response.meta['crawlid'], response.meta['spiderid'], response.meta['appid'], len(item_urls)) for item_url in item_urls: yield Request(url=item_url, callback=self.parse_item, meta=response.meta) workers = response.meta.get('workers', {}) for worker in workers.keys(): workers[worker] = 0 if "if_next_page" in response.meta: del response.meta["if_next_page"] next_page_urls = [ urljoin(response.url, x) for x in list(set( response.xpath('//div[@id="pagn"]//span[@class="pagnRA"]/a/@href').extract() )) ] response.meta["if_next_page"] = True for next_page_url in next_page_urls: yield Request(url=next_page_url, callback=self.parse, meta=response.meta)
def mainlist(item): logger.info() thumb_series = get_thumb("squares", "thumb_canales_series.png") thumb_series_az = get_thumb("squares", "thumb_canales_series_az.png") thumb_buscar = get_thumb("squares", "thumb_buscar.png") itemlist = [] itemlist.append(Item(channel=item.channel, title="Listado alfabético", action="series_listado_alfabetico", thumbnail=thumb_series_az)) itemlist.append(Item(channel=item.channel, title="Todas las series", action="series", url=urlparse.urljoin(HOST, "listado/"), thumbnail=thumb_series)) itemlist.append(Item(channel=item.channel, title="Capítulos de estreno", action="homeSection", extra=CAPITULOS_DE_ESTRENO_STR, url=HOST , thumbnail=thumb_series)) itemlist.append(Item(channel=item.channel, title="Último actualizado", action="homeSection", extra="Último Actualizado", url=HOST , thumbnail=thumb_series)) itemlist.append(Item(channel=item.channel, title="Series más vistas", action="homeSection", extra="Series Más vistas", url=HOST , thumbnail=thumb_series)) itemlist.append(Item(channel=item.channel, title="Series menos vistas", action="homeSection", extra="Series Menos vistas", url=HOST , thumbnail=thumb_series)) itemlist.append(Item(channel=item.channel, title="Últimas fichas creadas", action="series", url=urlparse.urljoin(HOST, "fichas_creadas/"), thumbnail=thumb_series)) itemlist.append(Item(channel=item.channel, title="Buscar...", action="search", url=HOST, thumbnail=thumb_buscar)) if filtertools.context: itemlist = filtertools.show_option(itemlist, item.channel, list_idiomas, CALIDADES) return itemlist
def parseImgLinks(self,depth=1): url_response = None try: url_response = urllib2.urlopen(self.scrap_url,timeout=self._timeout) except Exception as e: print(" [ERROR]: Could not open {0}: {1}".format(self.scrap_url,e.reason)) return self.img_list html_parse = BeautifulSoup(url_response) unique_images_found = 0 total_images_found = 0 self.visited[self.scrap_url] = 1 for img in html_parse.findAll('img'): try: abs_url = urljoin(self.scrap_url,img['src']) if urlparse(img['src']).netloc == "" else img['src'] if abs_url not in self.img_list: self.img_list.add(abs_url) unique_images_found += 1 total_images_found += 1 except: pass print(" [Found %d images / %d new]: %s" % (total_images_found,unique_images_found,self.scrap_url)) if depth > 1: for a in html_parse.findAll('a'): try: if (urlparse(a['href']).netloc == "") or (urlparse(self.scrape_url_orig).netloc == urlparse(a['href']).netloc): self.scrap_url = urljoin(self.scrape_url_orig,a['href']) if self.scrap_url in self.visited: continue self.parseImgLinks(depth - 1) except: pass return self.img_list
def choose_reference(experiment, biorep_n, server, keypair, sex_specific): replicates = [common.encoded_get(urlparse.urljoin(server,rep_uri), keypair, frame='embedded') for rep_uri in experiment['replicates']] replicate = next(rep for rep in replicates if rep.get('biological_replicate_number') == biorep_n) logging.debug('Replicate uuid %s' %(replicate.get('uuid'))) organism_uri = replicate.get('library').get('biosample').get('organism') organism_obj = common.encoded_get(urlparse.urljoin(server,organism_uri), keypair) try: organism_name = organism_obj['name'] except: logging.error('%s:rep%d Cannot determine organism.' %(experiment.get('accession'), biorep_n)) raise return None else: logging.debug("Organism name %s" %(organism_name)) if sex_specific: try: sex = replicate.get('library').get('biosample').get('sex') assert sex in ['male', 'female'] except: logging.warning('%s:rep%d Sex is %s. Mapping to male reference.' %(experiment.get('accession'), biorep_n, sex)) sex = 'male' logging.debug('Organism %s sex %s' %(organism_name, sex)) else: sex = 'male' genome_assembly = args.assembly reference = next((ref.get('file') for ref in REFERENCES if ref.get('organism') == organism_name and ref.get('sex') == sex and ref.get('assembly') == genome_assembly), None) logging.debug('Found reference %s' %(reference)) return reference
def episodios(item): logger.info("{0} - {1}".format(item.title, item.url)) itemlist = [] # Descarga la página data = scrapertools.cache_page(item.url) fanart = scrapertools.find_single_match(data, "background-image[^'\"]+['\"]([^'\"]+)") plot = scrapertools.find_single_match(data, "id=['\"]profile2['\"]>\s*(.*?)\s*</div>") logger.debug("fanart: {0}".format(fanart)) logger.debug("plot: {0}".format(plot)) episodes = re.findall("<tr.*?href=['\"](?P<url>[^'\"]+).+?>(?P<title>.+?)</a>.*?<td>(?P<flags>.*?)</td>", data, re.MULTILINE | re.DOTALL) for url, title, flags in episodes: idiomas = " ".join(["[{0}]".format(IDIOMAS.get(language, "OVOS")) for language in re.findall("banderas/([^\.]+)", flags, re.MULTILINE)]) displayTitle = "{show} - {title} {languages}".format(show = item.show, title = title, languages = idiomas) logger.debug("Episode found {0}: {1}".format(displayTitle, urlparse.urljoin(HOST, url))) itemlist.append(item.clone(title=displayTitle, url=urlparse.urljoin(HOST, url), action="findvideos", plot=plot, fanart=fanart, language=idiomas, list_idiomas=list_idiomas, list_calidad=CALIDADES, context=filtertools.context)) if len(itemlist) > 0 and filtertools.context: itemlist = filtertools.get_links(itemlist, item.channel) if config.get_library_support() and len(itemlist) > 0: itemlist.append(item.clone(title="Añadir esta serie a la biblioteca", action="add_serie_to_library", extra="episodios")) return itemlist
def __search(self, titles, type, year, season=0, episode=False): try: years = [str(year), str(int(year) + 1), str(int(year) - 1)] years = ['&veroeffentlichung[]=%s' % i for i in years] query = self.search_link % (type, urllib.quote_plus(cleantitle.query(titles[0]))) query += ''.join(years) query = urlparse.urljoin(self.base_link, query) t = [cleantitle.get(i) for i in set(titles) if i] r = self.__proceed_search(query) r = [i[0] for i in r if cleantitle.get(i[1]) in t and int(i[2]) == int(season)][0] url = source_utils.strip_domain(r) if episode: r = client.request(urlparse.urljoin(self.base_link, url)) r = dom_parser.parse_dom(r, 'div', attrs={'class': 'season-list'}) r = dom_parser.parse_dom(r, 'li') r = dom_parser.parse_dom(r, 'a', req='href') r = [i.attrs['href'] for i in r if i and int(i.content) == int(episode)][0] url = source_utils.strip_domain(r) return url except: return
def check_page(self, page): self.marionette.navigate(urlparse.urljoin(self.server_prefix, page)) try: self.marionette.find_element("id", 'complete') except NoSuchElementException: fullPageUrl = urlparse.urljoin(self.relPath, page) details = "%s: 1 failure encountered\n%s" % \ (fullPageUrl, self.get_failure_summary( fullPageUrl, "Waiting for Completion", "Could not find the test complete indicator")) raise AssertionError(details) fail_node = self.marionette.find_element("css selector", '.failures > em') if fail_node.text == "0": return # This may want to be in a more general place triggerable by an env # var some day if it ends up being something we need often: # # If you have browser-based unit tests which work when loaded manually # but not from marionette, uncomment the two lines below to break # on failing tests, so that the browsers won't be torn down, and you # can use the browser debugging facilities to see what's going on. #from ipdb import set_trace #set_trace() raise AssertionError(self.get_failure_details(page))
def get_sources(self, video): source_url = self.get_url(video) hosters = [] if source_url and source_url != FORCE_NO_MATCH: page_url = urlparse.urljoin(self.base_url, source_url) html = self._http_get(page_url, cache_limit=.25) match = re.search('''<option[^>]+value\s*=\s*["']([^"']+)[^>]*>(?:Altyaz.{1,3}s.{1,3}z)<''', html) if match: option_url = urlparse.urljoin(self.base_url, match.group(1)) html = self._http_get(option_url, cache_limit=.25) fragment = dom_parser.parse_dom(html, 'span', {'class': 'object-wrapper'}) if fragment: iframe_url = dom_parser.parse_dom(fragment[0], 'iframe', ret='src') if iframe_url: html = self._http_get(iframe_url[0], cache_limit=.25) seen_urls = {} for match in re.finditer('"?file"?\s*:\s*"([^"]+)"\s*,\s*"?label"?\s*:\s*"(\d+)p?[^"]*"', html): stream_url, height = match.groups() if stream_url not in seen_urls: seen_urls[stream_url] = True stream_url += '|User-Agent=%s' % (scraper_utils.get_ua()) host = self._get_direct_hostname(stream_url) if host == 'gvideo': quality = scraper_utils.gv_get_quality(stream_url) else: quality = scraper_utils.height_get_quality(height) hoster = {'multi-part': False, 'host': self._get_direct_hostname(stream_url), 'class': self, 'quality': quality, 'views': None, 'rating': None, 'url': stream_url, 'direct': True} hosters.append(hoster) return hosters
def _getFullUrl(url): if self.cm.isValidUrl(url): return url else: return urljoin(baseUrl, url)
def generate_context(self): """change the context""" # return the list of files to use files = self.get_files(self.path, exclude=[ 'pages', ]) all_articles = [] for f in files: content, metadata = read_file(f) # if no category is set, use the name of the path as a category if 'category' not in metadata.keys(): if os.path.dirname(f) == self.path: category = self.settings['DEFAULT_CATEGORY'] else: category = os.path.basename(os.path.dirname(f)) if category != '': metadata['category'] = unicode(category) if 'date' not in metadata.keys()\ and self.settings['FALLBACK_ON_FS_DATE']: metadata['date'] = datetime.fromtimestamp(os.stat(f).st_ctime) article = Article(content, metadata, settings=self.settings, filename=f) if not is_valid_content(article, f): continue add_to_url = u'' if 'ARTICLE_PERMALINK_STRUCTURE' in self.settings: article_permalink_structure = self.settings[ 'ARTICLE_PERMALINK_STRUCTURE'] article_permalink_structure = article_permalink_structure.lstrip( '/') # try to substitute any python datetime directive add_to_url = article.date.strftime(article_permalink_structure) # try to substitute any article metadata in rest file add_to_url = add_to_url % article.__dict__ add_to_url = [slugify(i) for i in add_to_url.split('/')] add_to_url = os.path.join(*add_to_url) article.url = urlparse.urljoin(add_to_url, article.url) article.save_as = urlparse.urljoin(add_to_url, article.save_as) if article.status == "published": if hasattr(article, 'tags'): for tag in article.tags: self.tags[tag].append(article) all_articles.append(article) elif article.status == "draft": self.drafts.append(article) self.articles, self.translations = process_translations(all_articles) for article in self.articles: # only main articles are listed in categories, not translations self.categories[article.category].append(article) self.authors[article.author].append(article) # sort the articles by date self.articles.sort(key=attrgetter('date'), reverse=True) self.dates = list(self.articles) self.dates.sort(key=attrgetter('date'), reverse=self.context['REVERSE_ARCHIVE_ORDER']) # create tag cloud tag_cloud = defaultdict(int) for article in self.articles: for tag in getattr(article, 'tags', []): tag_cloud[tag] += 1 tag_cloud = sorted(tag_cloud.items(), key=itemgetter(1), reverse=True) tag_cloud = tag_cloud[:self.settings.get('TAG_CLOUD_MAX_ITEMS')] tags = map(itemgetter(1), tag_cloud) if tags: max_count = max(tags) steps = self.settings.get('TAG_CLOUD_STEPS') # calculate word sizes self.tag_cloud = [ (tag, int( math.floor(steps - (steps - 1) * math.log(count) / (math.log(max_count) or 1)))) for tag, count in tag_cloud ] # put words in chaos random.shuffle(self.tag_cloud) # and generate the output :) # order the categories per name self.categories = list(self.categories.items()) self.categories.sort( reverse=self.settings.get('REVERSE_CATEGORY_ORDER')) self.authors = list(self.authors.items()) self.authors.sort() self._update_context(('articles', 'dates', 'tags', 'categories', 'tag_cloud', 'authors'))
def get_url(self, cmd, **args): cmd_path = cmd if not args else cmd + '?{0}'.format(urlencode(args)) return self.location(urljoin(Importer.api_base, cmd_path))[0]
def redirect(url): newloc = urlparse.urljoin(context.home + context.path, url) context.status = '301 Moved Permanently' header('Content-Type', 'text/html') header('Location', newloc)
def get(self, cdnRelativePath, relative=True): url = urlparse.urljoin(self._cdnRootUrl, cdnRelativePath) return super(CdnResourcesCache, self).get(url, relative=relative)
def sources(self, url, hostDict, hostprDict): try: sources = [] if url == None: return sources if debrid.status() == False: raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else data['year'] query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year']) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) url = self.search_link % urllib.quote_plus(query) url = urlparse.urljoin(self.base_link, url) scraper = cfscrape.create_scraper() r = scraper.get(url).content posts = client.parseDOM(r, 'item') hostDict = hostprDict + hostDict print posts items = [] for post in posts: try: print post items += zip(client.parseDOM(post, 'title'), client.parseDOM(post, 'link')) except: pass items = [(i[0], i[1]) for i in items if data['year'] in i[0]] print items[:1] for item in items: try: name = item[0] name = client.replaceHTMLCodes(name) t = re.sub('(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*|3D)(\.|\)|\]|\s|)(.+|)', '', name) if not cleantitle.get(t) == cleantitle.get(title): raise Exception() y = re.findall('[\.|\(|\[|\s](\d{4}|S\d*E\d*|S\d*)[\.|\)|\]|\s]', name)[-1].upper() if not y == hdlr: raise Exception() fmt = re.sub('(.+)(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*)(\.|\)|\]|\s)', '', name.upper()) fmt = re.split('\.|\(|\)|\[|\]|\s|\-', fmt) fmt = [i.lower() for i in fmt] if any(i.endswith(('subs', 'sub', 'dubbed', 'dub')) for i in fmt): raise Exception() if any(i in ['extras'] for i in fmt): raise Exception() if '1080p' in fmt: quality = '1080p' elif '720p' in fmt: quality = 'HD' else: quality = 'SD' if any(i in ['dvdscr', 'r5', 'r6'] for i in fmt): quality = 'SCR' elif any(i in ['camrip', 'tsrip', 'hdcam', 'hdts', 'dvdcam', 'dvdts', 'cam', 'telesync', 'ts'] for i in fmt): quality = 'CAM' info = [] if '3d' in fmt: info.append('3D') try: size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) [M|G]B)', name)[-1] div = 1 if size.endswith(' GB') else 1024 size = float(re.sub('[^0-9|/.|/,]', '', size))/div size = '%.2f GB' % size info.append(size) except: pass if any(i in ['hevc', 'h265', 'x265'] for i in fmt): info.append('HEVC') info = ' | '.join(info) url = item[1] if any(x in url for x in ['.rar', '.zip', '.iso']): raise Exception() url = client.replaceHTMLCodes(url) url = url.encode('utf-8') host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(url.strip().lower()).netloc)[0] if not host in hostDict: raise Exception() host = client.replaceHTMLCodes(host) host = host.encode('utf-8') sources.append({'source': host, 'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True}) except: pass return sources except: return sources
def main(): print 'sending requests to %s' % BASE_URL # DELETE map map_url = urljoin(BASE_URL, '/maps;ayesha:nursery-rhymes') headers = {'Authorization': 'Bearer %s' % TOKEN1} r = requests.delete(map_url, headers=headers) if r.status_code == 200: print 'deleted test map %s etag: %s' % (r.headers['Content-Location'], r.headers['etag']) elif r.status_code == 404: print 'test map was not there %s' % (map_url) else: print 'failed to delete map %s %s %s' % (map_url, r.status_code, r.text) return # Make sure the permissions exist for the test Org org_url = '/v1/o/ayesha' permissions = { '_subject': org_url, '_permissions': { 'read': [USER1], 'update': [USER1], 'delete': [USER1] }, '_self': { 'read': [USER1], 'delete': [USER1], 'update': [USER1], 'create': [USER1] }, 'maps': { 'read': [USER1], 'delete': [USER1], 'create': [USER1] } } permissons_url = urljoin(BASE_URL, '/permissions') headers = {'Authorization': 'Bearer %s' % TOKEN1, 'Content-Type': 'application/json'} r = requests.post(permissons_url, headers=headers, json=permissions) if r.status_code == 201: print 'correctly created permissions for org %s etag: %s' % (r.headers['Location'], r.headers['etag']) elif r.status_code == 409: print 'correctly saw that permissions for org %s already exist' % (org_url) else: print 'failed to create map %s %s %s' % (maps_url, r.status_code, r.text) return # Create map using POST map = { 'isA': 'Map', 'org': '/v1/o/ayesha', 'name': 'nursery-rhymes', 'test-data': True } maps_url = urljoin(BASE_URL, '/maps') headers = {'Content-Type': 'application/json','Authorization': 'Bearer %s' % TOKEN1} r = requests.post(maps_url, headers=headers, json=map) if r.status_code == 201: print 'correctly created map %s etag: %s' % (r.headers['Location'], r.headers['etag']) map_url = urljoin(BASE_URL, r.headers['Location']) print 'text:', type(r.text) map_entries = urljoin(BASE_URL, r.json()['entries']) else: print 'failed to create map %s %s %s' % (maps_url, r.status_code, r.text) return # GET Map headers = {'Accept': 'application/json','Authorization': 'Bearer %s' % TOKEN1} r = requests.get(map_url, headers=headers, json=map) if r.status_code == 200: map_url2 = urljoin(BASE_URL, r.headers['Content-Location']) if map_url == map_url2: map = r.json() print 'correctly retrieved map: %s etag: %s' % (map_url, r.headers['etag']) else: print 'retrieved map at %s but Content-Location is wrong: %s' % (map_url, map_url2) return else: print 'failed to retrieve map %s %s %s' % (map_url, r.status_code, r.text) return # POST entry for Humpty Dumpty entry = { 'isA': 'MapEntry', 'key': 'HumptyDumpty', 'test-data': True } entries_url = urljoin(BASE_URL, map['entries']) headers = {'Content-Type': 'application/json','Authorization': 'Bearer %s' % TOKEN1} r = requests.post(entries_url, headers=headers, json=entry) if r.status_code == 201: entry_url = urljoin(BASE_URL, r.headers['Location']) value_ref = urljoin(BASE_URL, r.json()['value']) print 'correctly created entry: %s value: %s map: %s etag: %s' % (entry_url, value_ref, urljoin(BASE_URL, r.json()['map']), r.headers['etag']) else: print 'failed to create entry %s %s %s' % (entries_url, r.status_code, r.text) return # GET entry for Humpty Dumpty headers = {'Accept': 'application/json','Authorization': 'Bearer %s' % TOKEN1} r = requests.get(entry_url, headers=headers) if r.status_code == 200: value_ref = urljoin(BASE_URL, r.json()['value']) print 'correctly retrieved entry: %s value: %s map: %s etag: %s' % (entry_url, value_ref, urljoin(BASE_URL, r.json()['map']), r.headers['etag']) else: 
print 'failed to retrieve entry %s %s %s' % (entry_url, r.status_code, r.text) return # PUT value for HumptyDumpty headers = {'Content-Type': 'text/plain','Authorization': 'Bearer %s' % TOKEN1} r = requests.put(value_ref, headers=headers, data='Humpty Dumpty Sat on a wall') if r.status_code == 200: loc = r.headers['Content-Location'] print 'correctly created value: %s etag: %s' % (loc, r.headers['etag']) value_url = urljoin(BASE_URL, r.headers['Content-Location']) else: print 'failed to create value %s %s %s' % (value_ref, r.status_code, r.text) return # PUT value for LittleMissMuffet headers = {'Content-Type': 'text/plain','Authorization': 'Bearer %s' % TOKEN1} value_ref2 = '%s/entries;%s/value' % (map_url, 'LittleMissMuffet') r = requests.put(value_ref2, headers=headers, data='Little Miss Muffet\nSat on a tuffet') if r.status_code == 200: loc = r.headers['Content-Location'] print 'correctly created value: %s etag: %s' % (loc, r.headers['etag']) value_url = urljoin(BASE_URL, loc) else: print 'failed to create value %s %s %s' % (value_ref2, r.status_code, r.text) return # GET entry for LittleMissMuffet entry_ref2 = '%s/entries;%s' % (map_url, 'LittleMissMuffet') headers = {'Accept': 'application/json','Authorization': 'Bearer %s' % TOKEN1} r = requests.get(entry_ref2, headers=headers) if r.status_code == 200: value_ref = urljoin(BASE_URL, r.json()['value']) assert(value_ref == value_url) print 'correctly retrieved entry: %s value: %s map: %s etag: %s' % (entry_ref2, value_ref, urljoin(BASE_URL, r.json()['map']), r.headers['etag']) else: print 'failed to retrieve entry %s %s %s' % (entry_ref2, r.status_code, r.text) return # GET value for LittleMissMuffet headers = {'Authorization': 'Bearer %s' % TOKEN1} r = requests.get(value_ref2, headers=headers) if r.status_code == 200: loc = r.headers['Content-Location'] print 'correctly got value at %s length: %s etag: %s text: %s' % (loc, len(r.text), r.headers['etag'], r.text) else: print 'failed to get value %s %s %s' % (value_ref2, r.status_code, r.text) return # GET all entries for map headers = {'Accept': 'application/json','Authorization': 'Bearer %s' % TOKEN1} r = requests.get(map_entries, headers=headers, json=map) if r.status_code == 200: print 'correctly retrieved map entries: %s' % map_url else: print 'failed to retrieve map entries %s %s %s' % (map_url, r.status_code, r.text) return # GET map by name name_url = urljoin(BASE_URL, '/maps;ayesha:nursery-rhymes') headers = {'Accept': 'application/json','Authorization': 'Bearer %s' % TOKEN1} r = requests.get(name_url, headers=headers, json=map) if r.status_code == 200: print 'correctly retrieved map by name: %s etag: %s' % (name_url, r.headers['etag']) else: print 'failed to retrieve map by name %s %s %s' % (name_url, r.status_code, r.text) return map = { 'isA': 'Map', 'name': 'nursery-rhymes', 'org': '/v1/o/ayesha', 'test-data': True } # Create map with duplicate name headers = {'Content-Type': 'application/json','Authorization': 'Bearer %s' % TOKEN1} r = requests.post(maps_url, headers=headers, json=map) if r.status_code == 409: print 'correctly refused to create map with duplicate name %s' % (r.text) else: print 'failed to reject map with duplicate name %s %s %s' % (maps_url, r.status_code, r.text) return # GET entries by map name entries_url = urljoin(BASE_URL, '/maps;ayesha:nursery-rhymes/entries') headers = {'Accept': 'application/json','Authorization': 'Bearer %s' % TOKEN1} r = requests.get(entries_url, headers=headers, json=map) if r.status_code == 200: entries = r.json() if 'contents' 
in entries and isinstance(entries['contents'], list): print 'correctly retrieved map entries by name: %s' % (r.headers['Content-Location']) else: print 'wrong return type for map entries by name: %s type: %s' % (r.headers['Content-Location'], type(entries['contents'])) else: print 'failed to retrieve map entries by name %s %s %s' % (entries_url, r.status_code, r.text) return # GET entry by map name and key entry_url = urljoin(BASE_URL, '/maps;ayesha:nursery-rhymes/entries;HumptyDumpty') headers = {'Accept': 'application/json','Authorization': 'Bearer %s' % TOKEN1} r = requests.get(entry_url, headers=headers, json=map) if r.status_code == 200: print 'correctly retrieved map entry by name from map by name: %s returned: %s' % (entry_url, r.headers['Content-Location']) else: print 'failed to retrieve map entry by name from map by name %s %s %s' % (entry_url, r.status_code, r.text) return # GET value by map name and key value_url = urljoin(BASE_URL, '/maps;ayesha:nursery-rhymes/entries;HumptyDumpty/value') headers = {'Authorization': 'Bearer %s' % TOKEN1} r = requests.get(value_url, headers=headers, json=map) if r.status_code == 200: print 'correctly retrieved value from map entry by name: %s at: %s text: %s' % (value_url, r.headers['Content-Location'], r.text) else: print 'failed to retrieve value from map entry by name %s %s %s' % (value_url, r.status_code, r.text) return # DELETE map headers = {'Authorization': 'Bearer %s' % TOKEN1} r = requests.delete(map_url, headers=headers) if r.status_code == 200: print 'correctly deleted map %s etag: %s' % (r.headers['Content-Location'], r.headers['etag']) else: print 'failed to delete map %s %s %s' % (maps_url, r.status_code, r.text) return
def getTokenEndpoint(self): if not self.url: raise Exception("oauth url error", self.url) return urljoin(self.url, self.OAUTH2_PATH['token'])
def sources(self, url, hostDict, hostprDict): sources = [] try: if url == None: raise Exception() if not (self.api and not self.api == ''): raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = int(data['year']) if 'year' in data and not data['year'] == None else None season = int(data['season']) if 'season' in data and not data['season'] == None else None episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None query = '%s S%02dE%02d' % (title, season, episode) if 'tvshowtitle' in data else '%s %d' % (title, year) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) query += ' lang:%s' % self.language[0] query = urllib.quote_plus(query) url = urlparse.urljoin(self.base_link, self.search_link) hostDict = hostprDict + hostDict iterations = self.streamLimit/self.streamIncrease last = self.streamLimit - (iterations * self.streamIncrease) if not last: iterations = iterations - 1 last = self.streamIncrease iterations = iterations + 1 seen_urls = set() for type in self.types: searchFrom = 0 searchCount = self.streamIncrease for offset in range(iterations): if iterations == offset + 1: searchCount = last urlNew = url % (type, self.api, query, searchCount, searchFrom) searchFrom = searchFrom + self.streamIncrease results = client.request(urlNew) results = json.loads(results) apistatus = results['status'] if apistatus != 'success': break results = results['result'] added = False for result in results: jsonName = result['title'] jsonSize = result['sizeinternal'] jsonExtension = result['extension'] jsonLanguage = result['lang'] jsonHoster = result['hostername'].lower() jsonLink = result['hosterurls'][0]['url'] if jsonLink in seen_urls: continue seen_urls.add(jsonLink) if not jsonHoster in hostDict: continue if not self.extensionValid(jsonExtension): continue quality, info = source_utils.get_release_quality(jsonName) info.append(self.formatSize(jsonSize)) info.append(jsonName) info = '|'.join(info) sources.append({'source' : jsonHoster, 'quality': quality, 'language' : jsonLanguage, 'url' : jsonLink, 'info': info, 'direct' : False, 'debridonly' : False}) added = True if not added: break return sources except: return sources
def url(self, filename): return urlparse.urljoin(self.base_url, filename).replace('\\', '/')
scrape_and_look_for_next_link(next_url) if 4050 < i < 4500: next_url = ListofOKCases[i] print next_url record = {} record['URL'] = next_url scraperwiki.sqlite.save(['URL'], record) scrape_and_look_for_next_link(next_url) # --------------------------------------------------------------------------- # START HERE: define your starting URL - then # call a function to scrape the first page in the series. # --------------------------------------------------------------------------- base_url = 'http://www.oscn.net/dockets/' starting_url = urlparse.urljoin( base_url, 'GetCaseInformation.aspx?db=garfield&number=CF-2011-1') print starting_url global i i = 1 #for i in range(0,1): #There are 743 cases but 468 appears to be the server request limit CaseEndingNumbers() ListOfCaseEndingNumbers = list(CaseEndingNumbers()) GetOklahomaStateCases() ListofOKCases = list(GetOklahomaStateCases()) scrape_and_look_for_next_link(starting_url) # # Read in a page # html = scraperwiki.scrape("http://foo.com") # # # Find something on the page using css selectors
def getAuthorizeEndpoint(self): if not self.url: raise Exception("oauth url error", self.url) return urljoin(self.url, self.OAUTH2_PATH['authorize'])
def relative_path_to_absolute_uri(self, relative_path): """Return an aboslute URI given a relative path taking into account the test context.""" return urlparse.urljoin(BASE_URL, relative_path)
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []

        if url == None: return sources

        if debrid.status() == False: raise Exception()

        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']

        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else data['year']

        query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year'])
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

        s = client.request(self.base_link)
        s = re.findall('\'(http.+?)\'', s) + re.findall('\"(http.+?)\"', s)
        s = [i for i in s if urlparse.urlparse(self.base_link).netloc in i and len(i.strip('/').split('/')) > 3]
        s = s[0] if s else urlparse.urljoin(self.base_link, 'posts')
        s = s.strip('/')

        url = s + self.search_link % urllib.quote_plus(query)

        r = client.request(url)

        r = client.parseDOM(r, 'h2', attrs = {'class': 'post-title'})
        r = zip(client.parseDOM(r, 'a', ret='href'), client.parseDOM(r, 'a', ret='title'))
        r = [(i[0], i[1], re.sub('(\.|\(|\[|\s)(\d{4}|3D)(\.|\)|\]|\s|)(.+|)', '', i[1]), re.findall('[\.|\(|\[|\s](\d{4}|)([\.|\)|\]|\s|].+)', i[1])) for i in r]
        r = [(i[0], i[1], i[2], i[3][0][0], i[3][0][1]) for i in r if i[3]]
        r = [(i[0], i[1], i[2], i[3], re.split('\.|\(|\)|\[|\]|\s|\-', i[4])) for i in r]
        r = [i for i in r if cleantitle.get(title) == cleantitle.get(i[2]) and data['year'] == i[3]]
        r = [i for i in r if not any(x in i[4] for x in ['HDCAM', 'CAM', 'DVDR', 'DVDRip', 'DVDSCR', 'HDTS', 'TS', '3D'])]
        r = [i for i in r if '1080p' in i[4]][:1] + [i for i in r if '720p' in i[4]][:1]

        posts = [(i[1], i[0]) for i in r]

        hostDict = hostprDict + hostDict

        items = []

        for post in posts:
            try:
                t = post[0]

                u = client.request(post[1])
                u = re.findall('\'(http.+?)\'', u) + re.findall('\"(http.+?)\"', u)
                u = [i for i in u if not '/embed/' in i]
                u = [i for i in u if not 'youtube' in i]

                items += [(t, i) for i in u]
            except:
                pass

        for item in items:
            try:
                name = item[0]
                name = client.replaceHTMLCodes(name)

                t = re.sub('(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*|3D)(\.|\)|\]|\s|)(.+|)', '', name)
                if not cleantitle.get(t) == cleantitle.get(title): raise Exception()

                y = re.findall('[\.|\(|\[|\s](\d{4}|S\d*E\d*|S\d*)[\.|\)|\]|\s]', name)[-1].upper()
                if not y == hdlr: raise Exception()

                fmt = re.sub('(.+)(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*)(\.|\)|\]|\s)', '', name.upper())
                fmt = re.split('\.|\(|\)|\[|\]|\s|\-', fmt)
                fmt = [i.lower() for i in fmt]

                if any(i.endswith(('subs', 'sub', 'dubbed', 'dub')) for i in fmt): raise Exception()
                if any(i in ['extras'] for i in fmt): raise Exception()

                if '1080p' in fmt: quality = '1080p'
                elif '720p' in fmt: quality = 'HD'
                else: quality = 'SD'
                if any(i in ['dvdscr', 'r5', 'r6'] for i in fmt): quality = 'SCR'
                elif any(i in ['camrip', 'tsrip', 'hdcam', 'hdts', 'dvdcam', 'dvdts', 'cam', 'telesync', 'ts'] for i in fmt): quality = 'CAM'

                info = []

                if '3d' in fmt: info.append('3D')

                try:
                    size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) (?:GB|GiB|MB|MiB))', item[2])[-1]
                    div = 1 if size.endswith(('GB', 'GiB')) else 1024
                    size = float(re.sub('[^0-9|/.|/,]', '', size))/div
                    size = '%.2f GB' % size
                    info.append(size)
                except:
                    pass

                if any(i in ['hevc', 'h265', 'x265'] for i in fmt): info.append('HEVC')

                info = ' | '.join(info)

                url = item[1]
                if any(x in url for x in ['.rar', '.zip', '.iso']): raise Exception()
                url = client.replaceHTMLCodes(url)
                url = url.encode('utf-8')

                host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(url.strip().lower()).netloc)[0]
                if not host in hostDict: raise Exception()
                host = client.replaceHTMLCodes(host)
                host = host.encode('utf-8')

                sources.append({'source': host, 'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True})
            except:
                pass

        check = [i for i in sources if not i['quality'] == 'CAM']
        if check: sources = check

        return sources
    except:
        return sources
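# Several scrapers in this collection pass title/season/episode metadata between methods
# as a URL-encoded query string and then flatten parse_qs's list values, as the sources()
# function above does. A minimal Python 2 sketch of that idiom with a made-up query string:
import urlparse

url = 'tvshowtitle=Some+Show&year=2015&season=1&episode=2'  # made-up metadata
data = urlparse.parse_qs(url)  # values come back as lists, e.g. {'season': ['1'], ...}
data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
print '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode']))
# Some Show S01E02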
def videolist(params, url, category):
    logger.info("[veocine.py] mainlist")

    # ------------------------------------------------------
    # Download the page
    # ------------------------------------------------------
    data = scrapertools.cachePage(url)
    #logger.info(data)

    # ------------------------------------------------------
    # Extract the movies
    # ------------------------------------------------------
    patron = '<tr.*?'
    patron += '<td.*?'
    patron += '<a href="([^"]+)">'
    patron += "<img src='([^']+)'.*?<a.*?>\s*(.*?)\s*<(.*?)"
    patron += "<img .*? alt='([^']+)' />"
    matches = re.compile(patron, re.DOTALL).findall(data)
    if DEBUG:
        scrapertools.printMatches(matches)

    for match in matches:
        try:
            scrapedtitle = unicode(match[2], "utf-8").encode("iso-8859-1") + " (" + match[4] + ")"
        except:
            scrapedtitle = match[2] + " (" + match[4] + ")"
        scrapedurl = urlparse.urljoin("http://www.veocine.es/", match[0])
        scrapedthumbnail = ""
        try:
            scrapedplot = unicode(match[3], "utf-8").encode("iso-8859-1")
        except:
            scrapedplot = match[3]
        scrapedplot = scrapedplot.replace("/a>", "\n")
        scrapedplot = scrapedplot.replace("<br />", "\n")
        scrapedplot = scrapedplot.replace("<b>", "")
        scrapedplot = scrapedplot.replace("</b>", "")
        scrapedplot = scrapedplot.replace("<i>", "")
        scrapedplot = scrapedplot.replace("</i>", "")
        scrapedplot = scrapedplot.replace("<!--colorstart:#589BB9-->", "")
        scrapedplot = scrapedplot.replace("<!--colorend-->", "")
        scrapedplot = scrapedplot.replace("<!--/colorend-->", "")
        scrapedplot = scrapedplot.replace("<!--/colorstart-->", "")
        scrapedplot = scrapedplot.replace('<span style="color:#589BB9">', "")
        scrapedplot = scrapedplot.replace("</span>", "")
        scrapedplot = scrapedplot.strip()

        # Debugging
        if DEBUG:
            logger.info("scrapedtitle=" + scrapedtitle)
            logger.info("scrapedurl=" + scrapedurl)
            logger.info("scrapedthumbnail=" + scrapedthumbnail)
            logger.info("scrapedplot=" + scrapedplot)

        # Add to the XBMC listing
        xbmctools.addnewfolder(__channel__, "listmirrors", category, scrapedtitle, scrapedurl, scrapedthumbnail, scrapedplot)

    # ------------------------------------------------------
    # Extract the next page link
    # ------------------------------------------------------
    patron = "<a href='([^']+)'>Siguiente</a>"
    matches = re.compile(patron, re.DOTALL).findall(data)
    if DEBUG:
        scrapertools.printMatches(matches)

    for match in matches:
        scrapedtitle = "Pagina siguiente"
        scrapedurl = urlparse.urljoin("http://www.veocine.es/", match)
        scrapedthumbnail = ""
        scrapeddescription = ""

        # Debugging
        if DEBUG:
            logger.info("scrapedtitle=" + scrapedtitle)
            logger.info("scrapedurl=" + scrapedurl)
            logger.info("scrapedthumbnail=" + scrapedthumbnail)

        # Add to the XBMC listing
        xbmctools.addthumbnailfolder(__channel__, scrapedtitle, scrapedurl, scrapedthumbnail, "mainlist")

    # Label (top-right)...
    xbmcplugin.setPluginCategory(handle=int(sys.argv[1]), category=category)

    # Disable sorting...
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]), sortMethod=xbmcplugin.SORT_METHOD_NONE)

    # End of directory...
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=True)
def sources(self, url, hostDict): sources = [] if not url: return sources try: scraper = cfscrape.create_scraper(delay=5) data = parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) title = data['tvshowtitle'] if 'tvshowtitle' in data else data[ 'title'] title = title.replace('&', 'and').replace('Special Victims Unit', 'SVU') aliases = data['aliases'] episode_title = data['title'] if 'tvshowtitle' in data else None year = data['year'] hdlr = 'S%02dE%02d' % (int(data['season']), int( data['episode'])) if 'tvshowtitle' in data else year query = '%s %s' % (title, hdlr) query = re.sub(r'[^A-Za-z0-9\s\.-]+', '', query) url = self.search_link % quote_plus(query) url = urljoin(self.base_link, url).replace('%3A+', '+') # log_utils.log('url = %s' % url, log_utils.LOGDEBUG) # result = scraper.get(url).content result = py_tools.ensure_str(scraper.get(url).content, errors='replace') if not result or "Sorry, but you are looking for something that isn't here" in str( result): return sources posts = client.parseDOM(result, "div", attrs={"class": "post"}) if not posts: return sources except: source_utils.scraper_error('MAXRLS') return sources for post in posts: try: post_title = client.parseDOM(post, "h2", attrs={"class": "postTitle"}) post_title = client.parseDOM(post_title, 'a')[0] if not source_utils.check_title(title, aliases, post_title, hdlr, year): continue content = client.parseDOM(post, "div", attrs={"class": "postContent"}) ltr = client.parseDOM(content, "p", attrs={"dir": "ltr"}) if not ltr: continue for i in ltr: if '<strong>' not in i or 'imdb.com' in i: continue name = re.search(r'<strong>(.*?)<', i).group(1) name = re.sub(r'(<span.*?>)', '', name).replace('</span>', '') if title not in name: continue # IMDB and Links: can be in name so check for title match name_info = source_utils.info_from_name( name, title, year, hdlr, episode_title) if source_utils.remove_lang(name_info): continue links = client.parseDOM(i, "a", ret="href") size = re.findall( r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', i, re.DOTALL) for link in links: url = link if url in str(sources): continue valid, host = source_utils.is_host_valid(url, hostDict) if not valid: continue quality, info = source_utils.get_release_quality( name_info, url) try: dsize, isize = source_utils._size(size[0]) info.insert(0, isize) except: dsize = 0 info = ' | '.join(info) sources.append({ 'provider': 'maxrls', 'source': host, 'name': name, 'name_info': name_info, 'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True, 'size': dsize }) except: source_utils.scraper_error('MAXRLS') return sources
def sources(self, url, hostDict, hostprDict): try: sources = [] if url == None: return sources data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] categ = 'tv-shows' if 'tvshowtitle' in data else 'movies' hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else data['year'] query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year']) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) url = self.search_link % (urllib.quote_plus(query), categ) url = urlparse.urljoin(self.base_link, url) headers = {'User-Agent': self.useragent} r = self.scraper.get(url, headers=headers).content r = re.findall('<item>\s*<title>([^<>]+)<\/title>\s*<link>([^<>]+)<\/link>', r) if len(r) == 0: raise Exception() hostDict = hostprDict + hostDict items = [] for item in r: try: t = item[0].rsplit('&', 1)[0] if any(x in t.lower() for x in ['.bonus.', '.extra.', '.extras.']): raise Exception() t = re.sub('(\[.*?\])|(<.+?>)', '', t) t1 = re.sub('(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d+|3D)(\.|\)|\]|\s|)(.+|)', '', t) if not cleantitle.get(t1) == cleantitle.get(title): raise Exception() y = re.findall('[\.|\(|\[|\s](\d{4}|S\d*E\d*|S\d*)[\.|\)|\]|\s]', t)[-1].upper() if not y == hdlr: raise Exception() headers = {'User-Agent': self.useragent} data = self.scraper.get(item[1], headers=headers).content data = client.parseDOM(data, 'div', attrs={'class': 'cont.+?'})[0] data = dom_parser2.parse_dom(data, 'a', req='href') u = [(t, i.attrs['href']) for i in data] items += u except: pass for item in items: try: name = item[0] name = client.replaceHTMLCodes(name) quality, info = source_utils.get_release_quality(name, item[1]) url = item[1] if not url.startswith('http'): continue if any(x in url for x in ['.rar', '.zip', '.iso']): raise Exception() url = client.replaceHTMLCodes(url) url = url.encode('utf-8') valid, host = source_utils.is_host_valid(url, hostDict) if not valid: continue host = client.replaceHTMLCodes(host) host = host.encode('utf-8') sources.append({'source': host, 'quality': quality, 'language': 'en', 'url': url, 'info': ' | '.join(info), 'direct': False, 'debridonly': True}) except: pass return sources except: log_utils.log('>>>> %s TRACE <<<<\n%s' % (__file__.upper().split('\\')[-1].split('.')[0], traceback.format_exc()), log_utils.LOGDEBUG) return sources
def parse(self, response):
    for product_url in response.xpath('//div[contains(@class, "products")]//a/@href').extract():
        yield scrapy.Request(urlparse.urljoin(response.url, product_url), callback=self.__parse_product_page)
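# In Scrapy 1.0 and later the same resolution is available as response.urljoin, which
# wraps urlparse.urljoin(response.url, ...). A sketch of the equivalent callback, keeping
# the selector and the (hypothetical) __parse_product_page callback assumed above:
def parse(self, response):
    for href in response.xpath('//div[contains(@class, "products")]//a/@href').extract():
        yield scrapy.Request(response.urljoin(href), callback=self.__parse_product_page)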
from flask import Flask, request

import os
import os.path
import re
import shelve
import subprocess
import threading
import time
import urlparse

## FIXME: This should go into a config file.
REPO_FS_BASE = '/srv/release/repository/release'
REPO_HTTP_BASE = 'http://packages.release.eucalyptus-systems.com/'
YUM_BASE = 'yum/builds/'
RPM_FS_BASE = os.path.join(REPO_FS_BASE, YUM_BASE)
RPM_HTTP_BASE = urlparse.urljoin(REPO_HTTP_BASE, YUM_BASE)
RESULT_CACHE_FILENAME = '/var/lib/genrepo/result-cache'

# A python shelf object: the lazy man's key-value store
RESULT_CACHE = None
RESULT_CACHE_LOCK = threading.Lock()

app = Flask(__name__)


@app.route('/api/1/genrepo/', methods=['GET', 'POST'])
def do_genrepo():
    if request.method == 'POST':
        params = request.form
    elif request.method == 'GET':
        params = request.args
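# The RPM_HTTP_BASE join above only appends YUM_BASE because REPO_HTTP_BASE is
# slash-terminated; urljoin otherwise replaces the base URL's last path segment.
# A small sketch of the difference (the second base is hypothetical):
import urlparse

print urlparse.urljoin('http://packages.release.eucalyptus-systems.com/', 'yum/builds/')
# -> http://packages.release.eucalyptus-systems.com/yum/builds/
print urlparse.urljoin('http://example.com/repo', 'yum/builds/')
# -> http://example.com/yum/builds/  (the 'repo' segment is dropped)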
def parse_hansard_post_1998(self, response): sel = Selector(response) # Get the year that this index page is for # Meetings (Year 2013 - 2014) # This is mostly for debugging purposes so we can spit this out in the logs year_range = sel.xpath('//strong/em/text()').extract() if not year_range: self.log("%s: Could not find year range on hansard index page" % response.url, level=log.WARNING) return else: self.log("%s: Parsing Hansard Index: %s" % (response.url, year_range), level=log.INFO) # Find any dates at the top of this page. Other dates are identical # to this page, and indeed the current page will also be included in # the date list. Scrapy will prevent us recursing back into ourselves. year_urls = sel.xpath('//tr/td/a[contains(@href,"#toptbl")]/@href').extract() for year_url in year_urls: absolute_url = urlparse.urljoin(response.url, year_url.strip()) req = Request(absolute_url, callback = self.parse_hansard_index_page) yield req # We are looking for table rows which link to Hansard entries for a # particular date. In newer versions these are 6-columned table rows # where column 6 is a link to a webcast (doesn't seem to exist) # Older revisions are 5 columned rows. These are all after the anchor # 'hansard'. print "Parsing Rows" # Find the handsard table table = sel.xpath("//div[@class='table_overflow']//a[@name='hansard']/following::table[1]") if not table: # http://www.legco.gov.hk/general/english/counmtg/yr08-12/mtg_0910.htm table = sel.xpath("//div[@id='_content_']//a[@name='hansard']/following::table[1]") rows = table.xpath(".//tr[count(td)>=5]") if not rows: self.log("%s: Could not find any Handard entries to crawl into" % response.url, level=log.WARNING) return self.log("%s: %i rows found" % (response.url, len(rows)), level=log.INFO) for row in rows: date_info = ' '.join(row.xpath('.//td[1]/node()/text()').extract()) self.log("%s: Row: %s" % (response.url, date_info), level=log.INFO) # Recurse into the agenda, if it exists agenda_url = row.xpath('.//td[2]/a/@href').extract() if agenda_url: absolute_url = urlparse.urljoin(response.url, agenda_url[0].strip()) req = Request(absolute_url, callback = self.parse_hansard_agenda) yield req else: self.log("%s: Could not find an agenda URL for %s" % (response.url, date_info), level=log.WARNING) # Download the minutes document if it exists. This is a PDF file minutes_url = row.xpath('.//td[3]/a/@href').extract() if minutes_url: absolute_url = urlparse.urljoin(response.url, minutes_url[0].strip()) minutes = HansardMinutes() minutes['date'] = date_info minutes['file_urls'] = [absolute_url] yield minutes else: self.log("%s: Could not find an minutes URL for %s" % (response.url, date_info), level=log.WARNING) for (lang, index) in [('en',4),('cn',5)]: hansard_urls = row.xpath('.//td[%i]/a/@href' % index).extract() for url in hansard_urls: # Is this a PDF entry, or do we need to recurse? absolute_url = urlparse.urljoin(response.url, url.strip()) if absolute_url.endswith('pdf'): hansard_record = HansardRecord() hansard_record['date'] = date_info hansard_record['language'] = lang hansard_record["file_urls"] = [absolute_url] yield hansard_record else: # Recurse into the HTML handler for the HTML Handard Record Index req = Request(absolute_url, callback = self.parse_hansard_html_record) yield req if not hansard_urls: self.log("%s: Could not find an hansard URL for %s, lang %s" % (response.url, date_info, lang), level=log.WARNING)
def sources(self, url, hostDict, hostprDict): try: sources = [] if url == None: return sources if debrid.status() == False: raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else data['year'] query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year']) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) url = self.search_link % urllib.quote_plus(query) url = urlparse.urljoin(self.base_link, url) r = client.request(url) posts = client.parseDOM(r, 'item') hostDict = hostprDict + hostDict items = [] for post in posts: try: t = client.parseDOM(post, 'title')[0] c = client.parseDOM(post, 'content.+?')[0] u = re.findall('>Single Link(.+?)p>\s*<span', c.replace('\n', ''))[0] u = client.parseDOM(u, 'a', ret='href') s = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) (?:GB|GiB|MB|MiB))', c) s = s[0] if s else '0' items += [(t, i, s) for i in u] except: pass for item in items: try: name = item[0] name = client.replaceHTMLCodes(name) t = re.sub('(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*|3D)(\.|\)|\]|\s|)(.+|)', '', name) if not cleantitle.get(t) == cleantitle.get(title): raise Exception() y = re.findall('[\.|\(|\[|\s](\d{4}|S\d*E\d*|S\d*)[\.|\)|\]|\s]', name)[-1].upper() if not y == hdlr: raise Exception() quality, info = source_utils.get_release_quality(name, item[1]) try: size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) (?:GB|GiB|MB|MiB))', item[2])[-1] div = 1 if size.endswith(('GB', 'GiB')) else 1024 size = float(re.sub('[^0-9|/.|/,]', '', size))/div size = '%.2f GB' % size info.append(size) except: pass info = ' | '.join(info) url = item[1] if any(x in url for x in ['.rar', '.zip', '.iso']): raise Exception() url = client.replaceHTMLCodes(url) url = url.encode('utf-8') valid, host = source_utils.is_host_valid(url,hostDict) host = client.replaceHTMLCodes(host) host = host.encode('utf-8') sources.append({'source': host, 'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True}) except: pass check = [i for i in sources if not i['quality'] == 'CAM'] if check: sources = check return sources except: return sources
def generate_deb_repo(distro, release, arch, url, commit): if (distro, release) not in (('ubuntu', 'lucid'), ('ubuntu', 'precise'), ('debian', 'sid')): return 'Error: invalid release: %s %s' % (distro, release), 400 if url.endswith("eucalyptus"): package_name = "eucalyptus" elif url.endswith("internal"): package_name = "eucalyptus-enterprise" else: return ('Error: Invalid url. Please end your URL with "eucalyptus" ' 'or "internal"'), 400 # Truncate to 6 characters commit = commit[:6] # Locate debs pool = os.path.join(REPO_FS_BASE, distro, 'pool/main/e', package_name) pool_contents = os.listdir(pool) current_high_ver = "0" counter = 0 for euca_file in pool_contents: if (commit in euca_file and euca_file.endswith('.deb') and release in euca_file): # Now determine the newest one fields = euca_file.split("_") euca_file_ver = fields[1] if apt.VersionCompare(euca_file_ver, current_high_ver) >= 1: current_high_ver = euca_file_ver counter += 1 # eucalyptus has 10 binary packages (java-common may go away) and internal # has 4 + a dummy package if we have less than that, bail, as an invalid # hash has been detected if (package_name == 'eucalyptus' and counter < 9) or counter < 4: return ('Error: You have requested a commit that does not exist in ' 'this distro/release.'), 404 # Generate the repository time.sleep(1) timestamp = str(int(time.time())) try: subprocess.check_call([ 'generate-eucalyptus-repository', distro, release, commit + '-' + timestamp ]) except subprocess.CalledProcessError: return 'Error: failed to generate the repository', 500 current_repo_name = release + "-" + commit + "-" + timestamp for euca_file in pool_contents: if (current_high_ver in euca_file and release in euca_file and euca_file.endswith('.deb')): try: subprocess.check_call([ 'reprepro', '--keepunreferencedfiles', '-V', '-b', os.path.join(REPO_FS_BASE, distro), 'includedeb', current_repo_name, os.path.join(pool, euca_file) ]) except subprocess.CalledProcessError: return 'Error: failed to add DEBs to new repo', 500 # Return the repo information return ' '.join( ('deb', urlparse.urljoin(REPO_HTTP_BASE, distro), current_repo_name, 'main')), 201
def sources(self, url, hostDict, hostprDict): try: sources = [] if url == None: return sources if not str(url).startswith('http'): data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) if 'tvshowtitle' in data: url = '%s/drama/%s/episode-%01d/' % ( self.base_link, cleantitle.geturl( data['tvshowtitle']), int(data['episode'])) else: url = '%s/movie/%s/' % (self.base_link, cleantitle.geturl(data['title'])) url = client.request(url, timeout='10', output='geturl') if url == None: raise Exception() else: url = urlparse.urljoin(self.base_link, url) r = client.request(url, timeout='10') r = client.request(url, timeout='10') links = client.parseDOM(r, 'iframe', ret='src') for link in links: if 'vidnow' in link: r = client.request(link, timeout='10') s = re.findall('window\.atob\(\"(.*?)\"\)', r) r = re.findall('(https:.*?(openload|redirector).*?)[\'\"]', r) for i in s: i = base64.b64decode(i) try: sources.append({ 'source': 'gvideo', 'quality': directstream.googletag(i)[0]['quality'], 'language': 'ko', 'url': i, 'direct': True, 'debridonly': False }) except: pass for i in r: if 'openload' in i: try: sources.append({ 'source': 'openload', 'quality': 'SD', 'language': 'ko', 'url': i[0], 'direct': False, 'debridonly': False }) except: pass elif 'google' in i: try: sources.append({ 'source': 'gvideo', 'quality': directstream.googletag(i)[0]['quality'], 'language': 'ko', 'url': i[0], 'direct': True, 'debridonly': False }) except: pass else: pass else: pass return sources except: return sources
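# The window.atob("...") matches in the function above carry base64-encoded stream URLs,
# which is why each hit is run through base64.b64decode before being added as a 'gvideo'
# source. A tiny sketch of that decode step with a made-up payload:
import base64

encoded = 'aHR0cDovL2V4YW1wbGUuY29tL3ZpZGVvLm1wNA=='  # made-up payload
print base64.b64decode(encoded)
# -> http://example.com/video.mp4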
log.info(
    '[@None] Fetching language pack manifests from {0}'.format(list_url))

if not list_url.startswith(settings.LANGPACK_DOWNLOAD_BASE):
    log.error('[@None] Not fetching language packs from invalid URL: '
              '{0}'.format(base_url))
    raise ValueError('Invalid path')

try:
    req = requests.get(list_url, verify=settings.CA_CERT_BUNDLE_PATH)
except Exception, e:
    log.error('[@None] Error fetching language pack list {0}: {1}'.format(
        path, e))
    return

xpi_list = [urljoin(list_base, line[-1])
            for line in map(str.split, req.iter_lines())]

allowed_file = re.compile(r'^[A-Za-z-]+\.xpi$').match

for url in xpi_list:
    # Filter out files not in the target langpack directory.
    if not url.startswith(base_url):
        continue
    xpi = url[len(base_url):]
    # Filter out entries other than direct child XPIs.
    if not allowed_file(xpi):
        continue
def parse1(self, response): # print response.body if '页面不存在' in response.body: Cookie = {'_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % (int(time.time())), 'JSESSIONID': '%s' % self.__md5sum("%s" % time.time()), 's_ViewType': '10', 'PHOENIX_ID': '0a0102f1-15c114151d0-436b141' } header['User-Agent'] = random.choice(ua_list) yield Request(response.url, errback=self.parse_failure, callback=self.parse, headers=header, cookies=Cookie, dont_filter=True) else: sel = Selector(response) detail_list = sel.xpath('//div[@class="reviews-items"]/ul/li') if detail_list: for detail in detail_list: item = DianpingcommentItem() comment_id = ''.join(detail.xpath('./a/@data-user-id').extract()) item['comment_id'] = comment_id shop_id = ''.join(re.findall('com/shop/(.*?)/review', response.url)) item['shop_id'] = shop_id href = ''.join( detail.xpath('./a/@href').extract()).strip().replace( '\n', '') name = ''.join( detail.xpath('./div[@class="main-review"]/div/a[@class="name"]/text()').extract()).strip().replace( '\n', '') # print href item['user_name'] = name user_id = href.replace('/member/', '').strip().replace('\n', '') item['user_id'] = user_id total_score = ''.join(detail.xpath( './div[@class="content"]/div[@class="user-info"]/span[1]/@class').extract()).strip().replace( '\n', '') if not total_score: total_score = ''.join(detail.xpath( './div[@class="content"]/p[@class="shop-info"]/span[1]/@class').extract()).strip().replace( '\n', '') total_score = total_score.replace('item-rank-rst irr-star', '') if total_score: total_score = int(total_score) / 10 item['total_score'] = total_score scores = detail.xpath('./div[@class="content"]/div[@class="user-info"]/div/span/text()').extract() if scores: if len(scores) == 3: score1 = scores[0] score2 = scores[1] score3 = scores[2] score1_name = score1[:-1] score1 = score1[-1:] item['score1_name'] = score1_name item['score1'] = score1 score2_name = score2[:-1] score2 = score2[-1:] item['score2_name'] = score2_name item['score2'] = score2 score3_name = score3[:-1] score3 = score3[-1:] item['score3_name'] = score3_name item['score3'] = score3 else: item['score1_name'] = '' item['score2_name'] = '' item['score3_name'] = '' item['score1'] = 0 item['score2'] = 0 item['score3'] = 0 comment_txt = ''.join(detail.xpath( './div[@class="content"]/div[@class="comment-txt"]/div/text()').extract()).strip().replace('\n', '') item['comment_text'] = comment_txt comment_dt = ''.join(detail.xpath( './div[@class="content"]/div[@class="misc-info"]/span/text()').extract()).strip().replace('\n', '') if comment_dt: comment_dt = comment_dt.replace(u'更新于', '') comment_dt = comment_dt.replace('\n', '').replace('\r', '').replace('\t', '').strip() comment_dt = comment_dt.split(u'\xa0') if comment_dt: comment_dt = comment_dt[0] if len(comment_dt) == 5: comment_dt = '2017-' + comment_dt elif len(comment_dt) == 8: comment_dt = '20' + comment_dt if ' ' in comment_dt: comment_dt = comment_dt.split(' ')[0] else: comment_dt = ''.join(detail.xpath( './div[@class="content"]/div[@class="misc-info"]/span/a[@class="time"]/text()').extract()).strip().replace( '\n', '') item['comment_dt'] = comment_dt contribution = ''.join( detail.xpath( './div[@class="pic"]/p[@class="contribution"]/span/@title').extract()).strip().replace( '\n', '') contribution = contribution.replace('贡献值', '').strip() item['user_contrib_val'] = contribution # try: # db_insert.insert('t_hh_dianping_shop_comments', **item) # except: # pass yield item next_page = sel.xpath('//a[@class="NextPage"]/@href') if next_page: next_page = 
''.join(next_page.extract()) next_page = urljoin(response.url, next_page) print next_page Cookie = {'_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % (int(time.time())), 'JSESSIONID': '%s' % self.__md5sum("%s" % time.time()), 's_ViewType': '10', 'PHOENIX_ID': '0a0102f1-15c114151d0-436b141' } header['User-Agent'] = random.choice(ua_list) yield Request(next_page, errback=self.parse_failure, callback=self.parse, headers=header, cookies=Cookie, dont_filter=True, ) else: print response.body
def episodios(item): logger.info("tvalacarta.channels.eltrece episodios") itemlist = [] ''' <div about="/la-noche-de-mirtha/programa-38_074529" typeof="sioc:Item foaf:Document" class="ds-1col node node--capitulo-completo view-mode-c13_capitulo_completo node--c13-capitulo-completo node--capitulo-completo--c13-capitulo-completo clearfix"> <figure data-desktop="217x122" data-tabletlandscape="217x122" data-tabletportrait="217x122" data-mobilelandscape="217x122" data-mobileportrait="217x122" alt="Programa 38 (10-01-15)" data-width="90" data-height="90" data-timestamp="1421945563" data-uri="public://2015/01/11/mirthascioli.jpg" class="field field--name-field-images field--type-image field--label-hidden" ><a href="/la-noche-de-mirtha/programa-38_074529" data-pagetype="capitulo_completo"><span class="hasvideo"></span><noscript><img src='public://styles/90x90/public/2015/01/11/mirthascioli.jpg?t=1421945563' width='90' height='90' alt='Programa 38 (10-01-15)' /></noscript></a><figcaption></figcaption></figure> <h2><a data-pagetype="capitulo_completo" href="/la-noche-de-mirtha/programa-38_074529">Programa 38 (10-01-15)</a></h2> <p>Invitados del programa de hoy: Daniel Scioli, Alejandra Maglietti, Facundo...</p></div> ''' # Descarga la página data = scrapertools.cache_page(item.url) item.url = urlparse.urljoin( item.url, scrapertools.find_single_match( data, 'href="(/[^\/]+/capitulos-completos)">Cap')) # Busca la opción de "Capítulos completos" data = scrapertools.cache_page(item.url) matches = re.compile('<figure(.*?)</div>', re.DOTALL).findall(data) for match in matches: logger.info("tvalacarta.channels.eltrece programas match=" + match) title = scrapertools.find_single_match( match, '<a data-pagetype="capitulo_completo" href="[^"]+">([^<]+)</a>') if title == "": title = scrapertools.find_single_match( match, "<figcaption>([^<]+)</figcaption>") if title == "": title = scrapertools.find_single_match(match, 'alt="([^"]+)"') title = scrapertools.htmlclean(title) url = urlparse.urljoin( item.url, scrapertools.find_single_match(match, 'a href="([^"]+)"')) thumbnail = scrapertools.find_single_match( match, 'data-uri="public\:\/\/([^"]+)"') thumbnail = "http://eltrecetv.cdncmd.com/sites/default/files/styles/298x168/public/" + thumbnail plot = scrapertools.find_single_match(match, '<p>([^<]+)</p>') if (DEBUG): logger.info("title=[" + title + "], url=[" + url + "], thumbnail=[" + thumbnail + "]") # Añade al listado itemlist.append( Item(channel=CHANNEL, action="play", server="eltrece", title=title, url=url, thumbnail=thumbnail, plot=plot, fanart=thumbnail, viewmode="movie_with_plot", folder=False)) # Paginación current_page = scrapertools.find_single_match(item.url, "page\=(\d+)") logger.info("tvalacarta.channels.eltrece programas current_page=" + current_page) if current_page == "": next_page_url = item.url + "?page=1" else: next_page_url = item.url.replace("page=" + current_page, "page=" + str(int(current_page) + 1)) logger.info("tvalacarta.channels.eltrece programas next_page_url=" + next_page_url) itemlist.append( Item(channel=CHANNEL, action="episodios", title=">> Página siguiente", url=next_page_url, folder=True)) return itemlist
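# The joins in episodios() rely on root-relative hrefs (paths beginning with '/'), which
# urljoin resolves against the scheme and host of item.url while discarding its own path.
# A small Python 2 sketch with a made-up page URL:
import urlparse

page_url = 'http://www.eltrecetv.com.ar/la-noche-de-mirtha/programas'  # hypothetical
print urlparse.urljoin(page_url, '/la-noche-de-mirtha/capitulos-completos')
# -> http://www.eltrecetv.com.ar/la-noche-de-mirtha/capitulos-completos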
def process_links(self, soup, baseurl, recursion_level, into_dir='links'): res = '' diskpath = os.path.join(self.current_dir, into_dir) if not os.path.exists(diskpath): os.mkdir(diskpath) prev_dir = self.current_dir try: self.current_dir = diskpath tags = list(soup.findAll('a', href=True)) for c, tag in enumerate(tags): if self.show_progress: print '.', sys.stdout.flush() sys.stdout.flush() iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0) if not iurl: continue nurl = self.normurl(iurl) if self.filemap.has_key(nurl): # noqa self.localize_link(tag, 'href', self.filemap[nurl]) continue if self.files > self.max_files: return res linkdir = 'link' + str(c) if into_dir else '' linkdiskpath = os.path.join(diskpath, linkdir) if not os.path.exists(linkdiskpath): os.mkdir(linkdiskpath) try: self.current_dir = linkdiskpath dsrc = self.fetch_url(iurl) newbaseurl = dsrc.newurl if len(dsrc) == 0 or \ len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0: raise ValueError('No content at URL %r' % iurl) if callable(self.encoding): dsrc = self.encoding(dsrc) elif self.encoding is not None: dsrc = dsrc.decode(self.encoding, 'replace') else: dsrc = xml_to_unicode(dsrc, self.verbose)[0] st = time.time() soup = self.get_soup(dsrc, url=iurl) self.log.debug('Parsed %s in %.1f seconds' % (iurl, time.time() - st)) base = soup.find('base', href=True) if base is not None: newbaseurl = base['href'] self.log.debug('Processing images...') self.process_images(soup, newbaseurl) if self.download_stylesheets: self.process_stylesheets(soup, newbaseurl) _fname = basename(iurl) if not isinstance(_fname, unicode): _fname.decode('latin1', 'replace') _fname = _fname.encode('ascii', 'replace').replace( '%', '').replace(os.sep, '') _fname = ascii_filename(_fname) _fname = os.path.splitext(_fname)[0][:120] + '.xhtml' res = os.path.join(linkdiskpath, _fname) self.downloaded_paths.append(res) self.filemap[nurl] = res if recursion_level < self.max_recursions: self.log.debug('Processing links...') self.process_links(soup, newbaseurl, recursion_level + 1) else: self.process_return_links(soup, newbaseurl) self.log.debug( 'Recursion limit reached. Skipping links in', iurl) if newbaseurl and not newbaseurl.startswith('/'): for atag in soup.findAll( 'a', href=lambda x: x and x.startswith('/')): atag['href'] = urlparse.urljoin( newbaseurl, atag['href'], True) if callable(self.postprocess_html_ext): soup = self.postprocess_html_ext( soup, c == 0 and recursion_level == 0 and not getattr(self, 'called_first', False), self.job_info) if c == 0 and recursion_level == 0: self.called_first = True save_soup(soup, res) self.localize_link(tag, 'href', res) except Exception as err: if isinstance(err, AbortArticle): raise self.failed_links.append((iurl, traceback.format_exc())) self.log.exception('Could not fetch link', iurl) finally: self.current_dir = diskpath self.files += 1 finally: self.current_dir = prev_dir if self.show_progress: print return res
# Start running maps and POSTs for m in dataMaps: #print 'Building %s map' % m data = { 'episode_id': 1 } for d in dataMaps[m]['map']: if parts[dataMaps[m]['map'][d]] != '-': #print '%s: %s' % (d, parts[dataMaps[m]['map'][d]]) data[d] = parts[dataMaps[m]['map'][d]] #print 'data is:' #print data #print 'Making url from %s and %s' % (baseUrl, dataMaps[m]['uri']) url = urlparse.urljoin(baseUrl, dataMaps[m]['uri']) print url urlBits = urlparse.urlparse(url) params = urllib.urlencode({'number': 12524, 'type': 'issue', 'action': 'show'}) headers = { "Content-type": "application/json", "Accept": "application/json", "Authorization": "Token " + token } print 'sending ' + json.dumps(data, separators=(',',':')) response = requests.post(url, data=json.dumps(data, separators=(',',':')), headers=headers) #print response.status, response.reason
def process_images(self, soup, baseurl): diskpath = unicode_path(os.path.join(self.current_dir, 'images')) if not os.path.exists(diskpath): os.mkdir(diskpath) c = 0 for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag. has_key('src')): # noqa iurl = tag['src'] if iurl.startswith('data:image/'): try: data = b64decode(iurl.partition(',')[-1]) except: self.log.exception('Failed to decode embedded image') continue else: if callable(self.image_url_processor): iurl = self.image_url_processor(baseurl, iurl) if not urlparse.urlsplit(iurl).scheme: iurl = urlparse.urljoin(baseurl, iurl, False) with self.imagemap_lock: if self.imagemap.has_key(iurl): # noqa tag['src'] = self.imagemap[iurl] continue try: data = self.fetch_url(iurl) if data == 'GIF89a\x01': # Skip empty GIF files as PIL errors on them anyway continue except Exception: self.log.exception('Could not fetch image ', iurl) continue c += 1 fname = ascii_filename('img' + str(c)) if isinstance(fname, unicode): fname = fname.encode('ascii', 'replace') itype = what(None, data) if itype is None and b'<svg' in data[:1024]: # SVG image imgpath = os.path.join(diskpath, fname + '.svg') with self.imagemap_lock: self.imagemap[iurl] = imgpath with open(imgpath, 'wb') as x: x.write(data) tag['src'] = imgpath else: try: # Ensure image is valid img = image_from_data(data) if itype not in {'png', 'jpg', 'jpeg'}: itype = 'png' if itype == 'gif' else 'jpeg' data = image_to_data(img, fmt=itype) if self.compress_news_images and itype in {'jpg', 'jpeg'}: try: data = self.rescale_image(data) except Exception: self.log.exception('failed to compress image ' + iurl) # Moon+ apparently cannot handle .jpeg files if itype == 'jpeg': itype = 'jpg' imgpath = os.path.join(diskpath, fname + '.' + itype) with self.imagemap_lock: self.imagemap[iurl] = imgpath with open(imgpath, 'wb') as x: x.write(data) tag['src'] = imgpath except Exception: traceback.print_exc() continue
def _get_issuer_publickey(self, issuer, key_id=None, insecure=False): # Set the user agent so Cloudflare isn't mad at us headers={'User-Agent': 'SciTokens/{}'.format(PKG_VERSION)} # Go to the issuer's website, and download the OAuth well known bits # https://tools.ietf.org/html/draft-ietf-oauth-discovery-07 well_known_uri = ".well-known/openid-configuration" if not issuer.endswith("/"): issuer = issuer + "/" parsed_url = urlparse.urlparse(issuer) updated_url = urlparse.urljoin(parsed_url.path, well_known_uri) parsed_url_list = list(parsed_url) parsed_url_list[2] = updated_url meta_uri = urlparse.urlunparse(parsed_url_list) # Make sure the protocol is https if not insecure: parsed_url = urlparse.urlparse(meta_uri) if parsed_url.scheme != "https": raise NonHTTPSIssuer("Issuer is not over HTTPS. RFC requires it to be over HTTPS") response = request.urlopen(request.Request(meta_uri, headers=headers)) data = json.loads(response.read().decode('utf-8')) # Get the keys URL from the openid-configuration jwks_uri = data['jwks_uri'] # Now, get the keys if not insecure: parsed_url = urlparse.urlparse(jwks_uri) if parsed_url.scheme != "https": raise NonHTTPSIssuer("jwks_uri is not over HTTPS, insecure!") response = request.urlopen(request.Request(jwks_uri, headers=headers)) keys_data = json.loads(response.read().decode('utf-8')) # Loop through each key, looking for the right key id public_key = "" raw_key = None # If there is no kid in the header, then just take the first key? if key_id == None: if len(keys_data['keys']) != 1: raise NotImplementedError("No kid in header, but multiple keys in " "response from certs server. Don't know which key to use!") else: raw_key = keys_data['keys'][0] else: # Find the right key for key in keys_data['keys']: if key['kid'] == key_id: raw_key = key break if raw_key == None: raise MissingKeyException("Unable to find key at issuer {}".format(jwks_uri)) if raw_key['kty'] == "RSA": public_key_numbers = rsa.RSAPublicNumbers( long_from_bytes(raw_key['e']), long_from_bytes(raw_key['n']) ) public_key = public_key_numbers.public_key(backends.default_backend()) elif raw_key['kty'] == 'EC': public_key_numbers = ec.EllipticCurvePublicNumbers( long_from_bytes(raw_key['x']), long_from_bytes(raw_key['y']), ec.SECP256R1 ) public_key = public_key_numbers.public_key(backends.default_backend()) else: raise UnsupportedKeyException("SciToken signed with an unsupported key type") return public_key
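# _get_issuer_publickey joins only the *path* component with the well-known suffix and
# then rebuilds the URL with urlunparse, so the issuer's scheme and host are preserved.
# A minimal sketch of that step, using a hypothetical issuer that is already
# slash-terminated as the method enforces:
import urlparse

issuer = 'https://issuer.example.org/'
well_known_uri = '.well-known/openid-configuration'

parsed = urlparse.urlparse(issuer)
path = urlparse.urljoin(parsed.path, well_known_uri)  # '/.well-known/openid-configuration'
parts = list(parsed)
parts[2] = path
print urlparse.urlunparse(parts)
# -> https://issuer.example.org/.well-known/openid-configuration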