def split_contents(self): """ Iterates over the elements in the block """ if self.split_content: return self.split_content split = self.soup.findAll({'link' : True, 'style' : True}) for elem in split: if elem.name == 'link' and elem['rel'] == 'stylesheet': filename = self.get_filename(elem['href']) path, ext = os.path.splitext(filename) if ext in settings.COMPILER_FORMATS.keys(): if self.recompile(filename): self.compile(path,settings.COMPILER_FORMATS[ext]) basename = os.path.splitext(os.path.basename(filename))[0] elem = BeautifulSoup(re.sub(basename+ext,basename+'.css',unicode(elem))) filename = path + '.css' try: self.split_content.append(('file', filename, elem)) except UncompressableFileError: if django_settings.DEBUG: raise if elem.name == 'style': data = elem.string elem_type = elem.get('type', '').lower() if elem_type and elem_type != "text/css": # it has to be preprocessed if '/' in elem_type: # we accept 'text/ccss' and plain 'ccss' too elem_type = elem_type.split('/')[1] # TODO: that dot-adding compatibility stuff looks strange. # do we really need a dot in COMPILER_FORMATS keys? ext = '.'+elem_type data = self.compile_inline(data,ext) elem = ''.join(("<style type='text/css'>\n",data,"\n</style>")) self.split_content.append(('hunk', data, elem)) return self.split_content
def item_enclosure_url(self, item):
    """Returns an image URL for the enclosure"""
    if item.image:
        url = item.image.url
    else:
        img = BeautifulSoup(item.html_content).find('img')
        url = img.get('src') if img else None
    return urljoin(self.site_url, url) if url else None
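# item_enclosure_url() above follows Django's syndication Feed API; Django
# pairs it with the two hooks below when building the <enclosure> element.
# The concrete return values here are illustrative assumptions.
def item_enclosure_length(self, item):
    # the true byte size of a remote image is usually unknown up front
    return 0

def item_enclosure_mime_type(self, item):
    return 'image/jpeg'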
def authenticate_with_geni(uname, password, idprovider):
    """
    Authenticate with the GEE via the OpenID protocol.

    We actually do two redirects here: GEE -> GENI Portal -> OpenID provider.
    We could technically go straight to the GENI Portal, but we are really
    tied to the GEE, so if the GEE protocol somehow changes we don't have to
    change anything here.  Also, the GEE uses a really nice library for
    OpenID auth, so we can piggyback on that.
    """
    genilogin_url = 'https://portal.geni.net/secure/home.php'

    # set up an SSLv3 adapter for shibboleth authentication
    session = requests.Session()
    session.mount('https://', RequestsSSLv3Adapter())

    # get the identity provider list so we can grab the correct url
    try:
        idprovider_url = get_idprovider_url(idprovider)
    except KeyError:
        # stdout shows up in the swift log.. as info though
        print 'ID Provider %s cannot be found' % (idprovider)
        return False

    resp = session.get(genilogin_url)
    returl = parse_url_query_params(resp.url)['return'][0]

    # get the geni portal page with the required login params
    retparams = parse_url_query_params(returl)
    params = {'SAMLDS': retparams['SAMLDS'][0],
              'target': retparams['target'][0],
              'entityID': idprovider_url}
    heads = {'Referer': resp.url}
    resp = session.get(returl, params=params, headers=heads)

    # actually attempt to log in with the id provider
    loginurl = resp.url
    formdata = {'j_username': uname, 'j_password': password}
    resp = session.post(resp.url, data=formdata, headers={'Referer': resp.url})

    # if the response url is still equal to the login url, the login failed
    if resp.url == loginurl:
        print 'Login attempt failed'
        return False

    # go back to the geni portal with a login token
    authform = BeautifulSoup(resp.text).find('form')
    auth_redirect_url = authform.get('action')
    auth_relay_state = authform.find('input', attrs={'name': 'RelayState'}).get('value')
    auth_token = authform.find('input', attrs={'name': 'SAMLResponse'}).get('value')
    resp = session.post(auth_redirect_url,
                        data={'RelayState': auth_relay_state,
                              'SAMLResponse': auth_token})

    # return whether the request was successful; if so we are logged in!
    return resp.ok
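# Hedged usage sketch for the login flow above; the credentials and the
# provider name (which must match a key known to get_idprovider_url()) are
# assumptions.
if __name__ == '__main__':
    if authenticate_with_geni('alice', 's3cret', 'GENI Project Office'):
        print 'logged in'
    else:
        print 'login failed'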
def unwrap_html_body(html, css_class=None):
    """Return the content of the body tag for inline display in another
    html document.
    """
    soup = BeautifulSoup(html, fromEncoding='utf8')
    if soup.body:
        soup = soup.body
    body_soup = BeautifulSoup('<div>%s</div>' % soup.renderContents(),
                              fromEncoding='utf8')
    if css_class:
        body_soup.div['class'] = css_class
    body_style = soup.get('style')
    if body_style:
        body_soup.div['style'] = body_style
    return body_soup.renderContents()
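# Quick example of the function above (BeautifulSoup 3 API, matching the
# renderContents/fromEncoding calls it uses):
snippet = unwrap_html_body(
    '<html><body style="color: red"><p>hi</p></body></html>',
    css_class='embedded')
# snippet == '<div class="embedded" style="color: red"><p>hi</p></div>'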
def split_contents(self): """ Iterates over the elements in the block """ if self.split_content: return self.split_content split = self.soup.findAll({'link' : True, 'style' : True}) for elem in split: if elem.name == 'link' and elem['rel'] == 'stylesheet': filename = self.get_filename(elem['href']) path, ext = os.path.splitext(filename) if ext in settings.COMPILER_FORMATS.keys(): # that thing can be compiled try: css = pythonic_compile(open(filename).read(), ext) self.split_content.append({'data': css, 'elem': elem, 'filename': filename}) continue except PythonicCompilerNotFound: pass # let's run binary if self.recompile(filename): self.compile(path,settings.COMPILER_FORMATS[ext]) # filename and elem are fiddled to have link to plain .css file basename = os.path.splitext(os.path.basename(filename))[0] elem = BeautifulSoup(re.sub(basename+ext,basename+'.css',unicode(elem))) filename = path + '.css' try: self.split_content.append({'filename': filename, 'elem': elem}) except UncompressableFileError: if django_settings.DEBUG: raise if elem.name == 'style': data = elem.string elem_type = elem.get('type', '').lower() if elem_type and elem_type != "text/css": # it has to be preprocessed if '/' in elem_type: # we accept 'text/ccss' and plain 'ccss' too elem_type = elem_type.split('/')[1] # TODO: that dot-adding compatibility stuff looks strange. # do we really need a dot in COMPILER_FORMATS keys? ext = '.'+elem_type data = pythonic_compile(data, ext) self.split_content.append({'data': data, 'elem': elem}) return self.split_content
def scrapeIntradayData(html):
    '''
    Pull the datapoints out of an activity chart (Steps, Floors, Calories
    Burned).  Returns the (x, height) data points as a numpy array, plus the
    activity type.
    '''
    strain_intraday_data = SoupStrainer(
        name='section',
        attrs={'class': 'chart selected', 'id': re.compile('intraday.*Chart')})
    strain_data_point = SoupStrainer(name='rect', attrs={'width': '1'})

    soup = BeautifulSoup(html, parseOnlyThese=strain_intraday_data)
    # I think there should only ever be one chart selected
    if len(soup.contents) > 1:
        print 'This should not have happened!'
        print soup
    assert len(soup.contents) == 1
    soup = soup.contents[0]

    data_points = soup.findAll(strain_data_point)
    data_points = np.array([(int(i.get('x')), int(i.get('height')))
                            for i in data_points])
    # strip the trailing 'Chart' from the id to get the activity type
    return data_points, soup.get('id')[:-5]
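# Hedged usage sketch: feed a saved copy of an activity page through the
# scraper and total the per-minute counts.  The filename is an assumption.
with open('steps_page.html') as fh:
    points, activity = scrapeIntradayData(fh.read())
print activity, points[:, 1].sum()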
def split_contents(self): """ Iterates over the elements in the block """ if self.split_content: return self.split_content split = self.soup.findAll({'link': True, 'style': True}) for elem in split: if elem.name == 'link' and (elem['rel'] == 'stylesheet' or elem['rel'] == 'stylesheet/less'): filename = self.get_filename(elem['href']) path, ext = os.path.splitext(filename) if ext in settings.COMPILER_FORMATS.keys(): if self.recompile(filename): self.compile(path, settings.COMPILER_FORMATS[ext]) basename = os.path.splitext(os.path.basename(filename))[0] elem = BeautifulSoup( re.sub(basename + ext, basename + '.css', unicode(elem))) filename = path + '.css' try: self.split_content.append(('file', filename, elem)) except UncompressableFileError: if django_settings.DEBUG: raise if elem.name == 'style': data = elem.string elem_type = elem.get('type', '').lower() if elem_type and elem_type != "text/css": # it has to be preprocessed if '/' in elem_type: # we accept 'text/ccss' and plain 'ccss' too elem_type = elem_type.split('/')[1] # TODO: that dot-adding compatibility stuff looks strange. # do we really need a dot in COMPILER_FORMATS keys? ext = '.' + elem_type data = self.compile_inline(data, ext) elem = ''.join( ("<style type='text/css'>\n", data, "\n</style>")) self.split_content.append(('hunk', data, elem)) return self.split_content
def extract_forms(self, url):
    response = self.session.get(url)
    parsed_html = BeautifulSoup(response.content)
    # .get('form') would look up an HTML *attribute* named "form" on the
    # document and always return None; findAll is what actually collects
    # the <form> elements.
    return parsed_html.findAll('form')
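# Hedged usage sketch -- extract_forms() is clearly a method, so the owning
# class name here is an assumption:
#
#   scanner = Scanner('http://example.com')
#   for form in scanner.extract_forms('http://example.com/login'):
#       print form.get('action'), form.get('method')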
def loopLinks(siteURL):
    # were links loaded from this site? if not - load them
    # str.replace/strip return new strings, so the results must be reassigned
    siteURL = siteURL.replace("\r", "").replace("\n", "").strip()
    rs = scraperwiki.sqlite.select("count(*) as freq from swdata where src=?", siteURL)
    for d in rs:
        if d["freq"] == 0:
            req = Request(siteURL)
            try:
                soup = BeautifulSoup(urlopen(req))
            except HTTPError, e:
                print 'The server could not fulfill the request.'
                print 'Error code: ', e.code
            except URLError, e:
                print 'We failed to reach a server.'
                print 'Reason: ', e.reason
            else:
                aDomain = siteURL.replace("http://", "")
                aDomain = aDomain.replace("https://", "")
                aDomain = aDomain[:aDomain.find('/')]
                for tag in soup.findAll('a'):
                    linkURL = str(tag.get('href'))
                    addLink = "true"
                    # get absolute link
                    if linkURL.find('/') == 0:
                        linkURL = 'http://' + aDomain + linkURL
                    else:
                        if linkURL.find('http://' + aDomain + '/') != 0:
                            addLink = "false"
                    # ensure this is not the site url / home link
                    if linkURL == siteURL:
                        addLink = "false"
                    if linkURL == 'http://' + origDomain + '/':
                        addLink = "false"
                    linkURL = linkURL.replace("\r", "").replace("\n", "").strip()
                    # ensure link has not already been loaded
                    rs = scraperwiki.sqlite.select("count(*) as freq from swdata where href=?", linkURL)
                    for d in rs:
                        if d["freq"] > 0:
                            addLink = "false"
                    if addLink != "false":
                        print linkURL
                        if str(linkURL).find("http://mesfilmsbelgacomtv.skynet.be/fr/films/") == 0:
                            print "Film: " + linkURL
                            # get title, year, genre
                            filmReq = Request(linkURL)
                            try:
                                details = BeautifulSoup(urlopen(filmReq))
                            except HTTPError, e:
                                print 'The server could not fulfill the request.'
                                print 'Error code: ', e.code
                            except URLError, e:
                                print 'We failed to reach a server.'
                                print 'Reason: ', e.reason
                            else:
                                fYear = str("unknown")
                                print fYear
                                fGenre = str("unknown")
                                print fGenre
                                for yr in details.findAll(attrs={'class': 'year'}):
                                    if fYear == "unknown":
                                        # take the text of the first element marked
                                        # class="year" (the original mistakenly read
                                        # the outer link loop's `tag` variable here)
                                        fYear = str(yr.string)
                                        print str(fYear)
                                for gNome in details.findAll(attrs={'id': 'wrapperGenome'}):
                                    for ul in gNome.findAll('ul'):
                                        fndGenre = str("false")
                                        for li in ul.findAll('li'):
                                            # walk the list items: everything after the
                                            # "Genres:" label is a genre name (the
                                            # original mistakenly tested the outer
                                            # loop's `tag` here too)
                                            text = li.string
                                            if text is None:
                                                continue
                                            if fndGenre == "true":
                                                fGenre = str(fGenre).replace("unknown", "") + text
                                            if text == "Genres:":
                                                fndGenre = str("true")
                                print str(tag.get('title')).replace(" - Les films de Belgacom TV", "")
                                data = {
                                    'page': details.html.head.title.contents,
                                    'domain': aDomain,
                                    'src': siteURL,
                                    'text': details.NavigableString,
                                    'title': details.get('title'),
                                    'class': details.get('class'),
                                    'id': details.get('id'),
                                    'href': linkURL,
                                    'film': str(details.get('title')).replace(" - Les films de Belgacom TV", ""),
                                    'genre': '',
                                    'price': '',
                                    'priceSD': '',
                                    'priceHD': '',
                                    'releaseYear': str(fYear),
                                    'origCountry': '',
                                    'rating': '',
                                    'actors': '',
                                    'actresses': ''
                                }
                                scraperwiki.sqlite.save(unique_keys=['href'], data=data)
                        else:
                            print "Not a film: " + linkURL
                            data = {
                                'page': soup.html.head.title.contents,
                                'domain': aDomain,
                                'src': siteURL,
                                'text': tag.NavigableString,
                                'title': tag.get('title'),
                                'class': tag.get('class'),
                                'id': tag.get('id'),
                                'href': linkURL,
                                'film': '',
                                'genre': '',
                                'price': '',
                                'priceSD': '',
                                'priceHD': '',
                                'releaseYear': '',
                                'origCountry': '',
                                'rating': '',
                                'actors': '',
                                'actresses': ''
                            }
                            scraperwiki.sqlite.save(unique_keys=['href'], data=data)
                        if str(linkURL).find("http://mesfilmsbelgacomtv.skynet.be/fr/similar/") != 0:
                            if linkURL != "http://mesfilmsbelgacomtv.skynet.be/fr/":
                                loopLinks(linkURL)
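# Hedged usage sketch: loopLinks() reads the module-level origDomain global
# (it is never assigned inside the function), so something like this is
# assumed to kick the crawl off:
origDomain = 'mesfilmsbelgacomtv.skynet.be'
loopLinks('http://' + origDomain + '/fr/')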
import requests
from BeautifulSoup import BeautifulSoup as BS

resp = requests.get('http://example.com')
form = BS(resp.text).find('form')

# send POST request to "Accept" URL
data = {input.get('name'): input.get('value')
        for input in form.findAll('input')
        if input.get('name') is not None}
# the form's action is presumably protocol-relative (starts with "//"),
# hence the "http:" prefix
action = form.get('action')
resp = requests.post("http:" + action, data=data)
print("Success." if resp.status_code == 200
      else "Failure: POST to {} returned {}".format(action, str(resp)))
def get_baseurl(html):
    "get the base url"
    constrained = SoupStrainer(["BASE", "base"])
    base = BeautifulSoup(html, parseOnlyThese=constrained).find(["BASE", "base"])
    if base is None:
        # no <base> tag in the document
        return None
    return base.get("href") or base.get("HREF")
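# Example: resolve a relative link against the page's <base href>, falling
# back to the page's own URL when no base tag is present.  resolve_link is a
# hypothetical helper, not part of the original code.
from urlparse import urljoin

def resolve_link(page_url, html, link):
    base = get_baseurl(html) or page_url
    return urljoin(base, link)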