Example #1
 def split_contents(self):
     """ Iterates over the elements in the block """
     if self.split_content:
         return self.split_content
     split = self.soup.findAll({'link' : True, 'style' : True})
     for elem in split:
         if elem.name == 'link' and elem['rel'] == 'stylesheet':
             filename = self.get_filename(elem['href'])
             path, ext = os.path.splitext(filename)
             if ext in settings.COMPILER_FORMATS.keys():
                 if self.recompile(filename):
                     self.compile(path,settings.COMPILER_FORMATS[ext])
                 basename = os.path.splitext(os.path.basename(filename))[0]
                 elem = BeautifulSoup(re.sub(basename+ext,basename+'.css',unicode(elem)))
                 filename = path + '.css'
             try:
                 self.split_content.append(('file', filename, elem))
             except UncompressableFileError:
                 if django_settings.DEBUG:
                     raise
         if elem.name == 'style':
             data = elem.string            
             elem_type = elem.get('type', '').lower()
             if elem_type and elem_type != "text/css":
                 # it has to be preprocessed
                 if '/' in elem_type:
                     # we accept 'text/ccss' and plain 'ccss' too
                     elem_type = elem_type.split('/')[1]
                 # TODO: that dot-adding compatibility stuff looks strange.
                 # do we really need a dot in COMPILER_FORMATS keys?
                 ext = '.'+elem_type
                 data = self.compile_inline(data,ext)
                 elem = ''.join(("<style type='text/css'>\n",data,"\n</style>"))
             self.split_content.append(('hunk', data, elem))
     return self.split_content
Example #2
 def item_enclosure_url(self, item):
     """Returns an image for enclosure"""
     if item.image:
         url = item.image.url
     else:
         img = BeautifulSoup(item.html_content).find('img')
         url = img.get('src') if img else None
     return urljoin(self.site_url, url) if url else None
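
The else branch above falls back to the first <img> found in the rendered entry HTML. A minimal standalone sketch of that fallback with BeautifulSoup 3 on Python 2, using a made-up HTML snippet and site URL:

from urlparse import urljoin
from BeautifulSoup import BeautifulSoup

html = '<p>Intro text</p><img src="/media/cover.png" alt="cover" />'
img = BeautifulSoup(html).find('img')
url = img.get('src') if img else None
print urljoin('http://example.com/', url) if url else None
# prints: http://example.com/media/cover.png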
Example #4
def authenticate_with_geni(uname, password, idprovider):
  """ Authenticate with GEE via the openid protocol. 
  We actually do two redirects here: GEE -> GENI Portal -> Open id provider 
  We could technically go straight to the GENI Portal but we area really
  related to the GEE so if somehow the GEE protocol changes we dont have to 
  change anything here. Also the gee uses a really nice library for openid 
  auth so we can piggyback on that """

  genilogin_url = 'https://portal.geni.net/secure/home.php'


  # set up sslv3 adapter for shibboleth authentication
  session = requests.Session()
  session.mount('https://', RequestsSSLv3Adapter())

  # get identity provider list so we can grab the correct url
  try: idprovider_url = get_idprovider_url(idprovider)
  except KeyError:
    # stdout shows up in the swift log, though only as info
    print 'ID Provider %s cannot be found' % idprovider
    return False

  resp = session.get(genilogin_url)
  returl = parse_url_query_params(resp.url)['return'][0]

  # get the geni portal page with the required login params
  retparams = parse_url_query_params(returl)
  params = {'SAMLDS': retparams['SAMLDS'][0], 'target': retparams['target'][0], 'entityID': idprovider_url}
  heads = {'Referer': resp.url}
  resp = session.get(returl, params=params, headers=heads)
  
  # actually attempt to log in with the ID provider
  loginurl = resp.url
  formdata = {'j_username': uname, 'j_password': password}
  resp = session.post(resp.url, data=formdata, headers={'Referer':resp.url})

  # if the response URL is still the login URL, the login attempt failed
  if resp.url == loginurl:
    print 'Login attempt failed'
    return False

  # go back to the geni portal with a login token
  authform = BeautifulSoup(resp.text).find('form')
  auth_redirect_url = authform.get('action')
  auth_relay_state =  authform.find('input', attrs={'name':'RelayState'}).get('value')
  auth_token = authform.find('input', attrs={'name':'SAMLResponse'}).get('value')
  resp = session.post(auth_redirect_url, data={'RelayState':auth_relay_state, 'SAMLResponse': auth_token})

  # return whether the request was successful; if so, we are logged in
  return resp.ok
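
A minimal call site for the function above; the username, password, and identity-provider name are placeholders, and the helpers it references (RequestsSSLv3Adapter, get_idprovider_url, parse_url_query_params) are assumed to be defined in the same module:

if authenticate_with_geni('myuser', 'mypassword', 'GPO Lab'):
  print 'Logged in to the GEE via the GENI Portal'
else:
  print 'Login failed'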
Example #5
def unwrap_html_body(html, css_class=None):
    """Return the content of the body tag for inline display in another
    html document.
    """
    soup = BeautifulSoup(html, fromEncoding='utf8')
    if soup.body:
        soup = soup.body
    body_soup = BeautifulSoup('<div>%s</div>' % soup.renderContents(), fromEncoding='utf8')
    if css_class:
        body_soup.div['class'] = css_class
    body_style = soup.get('style')
    if body_style:
        body_soup.div['style'] = body_style
    return body_soup.renderContents()
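
A quick illustration of what unwrap_html_body() returns, with a throwaway HTML string (the exact attribute order in the output may vary):

html = '<html><body style="color: red"><p>Hello</p></body></html>'
print unwrap_html_body(html, css_class='embedded')
# roughly: <div class="embedded" style="color: red"><p>Hello</p></div>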
Example #6
    def split_contents(self):
        """ Iterates over the elements in the block """
        if self.split_content:
            return self.split_content
        split = self.soup.findAll({'link' : True, 'style' : True})
        for elem in split:
            if elem.name == 'link' and elem['rel'] == 'stylesheet':
                filename = self.get_filename(elem['href'])
                path, ext = os.path.splitext(filename)
                if ext in settings.COMPILER_FORMATS.keys():
                    # that thing can be compiled

                    try:
                        css = pythonic_compile(open(filename).read(), ext)
                        self.split_content.append({'data': css, 'elem': elem, 'filename': filename})
                        continue
                    except PythonicCompilerNotFound:
                        pass

                    # let's run binary
                    if self.recompile(filename):
                        self.compile(path,settings.COMPILER_FORMATS[ext])
                    # filename and elem are fiddled to have link to plain .css file
                    basename = os.path.splitext(os.path.basename(filename))[0]
                    elem = BeautifulSoup(re.sub(basename+ext,basename+'.css',unicode(elem)))
                    filename = path + '.css'
                try:
                    self.split_content.append({'filename': filename, 'elem': elem})
                except UncompressableFileError:
                    if django_settings.DEBUG:
                        raise
            if elem.name == 'style':
                data = elem.string
                elem_type = elem.get('type', '').lower()
                if elem_type and elem_type != "text/css":
                    # it has to be preprocessed
                    if '/' in elem_type:
                        # we accept 'text/ccss' and plain 'ccss' too
                        elem_type = elem_type.split('/')[1]
                    # TODO: that dot-adding compatibility stuff looks strange.
                    # do we really need a dot in COMPILER_FORMATS keys?
                    ext = '.'+elem_type
                    data = pythonic_compile(data, ext)

                self.split_content.append({'data': data, 'elem': elem})

        return self.split_content
Example #7
def scrapeIntradayData(html):
    '''
    Pull the datapoints out of an activity chart (Steps, Floors, Calories Burned).
    
    Returns the counts as a list, and the activity type.
    '''
    
    strain_intraday_data = SoupStrainer(name='section', attrs={'class':'chart selected', 'id':re.compile('intraday.*Chart')})
    strain_data_point    = SoupStrainer(name='rect'   , attrs={'width':'1'})
    
    soup = BeautifulSoup(html, parseOnlyThese=strain_intraday_data)
    # I think there should only ever be one chart selected
    if len(soup.contents) > 1:
        print 'This should not have happened!'
        print soup
        assert len(soup.contents) == 1
    soup = soup.contents[0]

    data_points = soup.findAll(strain_data_point)
    data_points = np.array([(int(i.get('x')), int(i.get('height'))) for i in data_points])
    
    return data_points, soup.get('id')[:-5]
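
Hypothetical usage of the scraper above, assuming the dashboard page has already been saved to disk (the filename is made up) and that the example's own imports (re, numpy as np, BeautifulSoup, SoupStrainer) are in place:

html = open('steps_page.html').read()
data_points, activity = scrapeIntradayData(html)
print activity, len(data_points)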
Example #8
 def split_contents(self):
     """ Iterates over the elements in the block """
     if self.split_content:
         return self.split_content
     split = self.soup.findAll({'link': True, 'style': True})
     for elem in split:
         if elem.name == 'link' and (elem['rel'] == 'stylesheet'
                                     or elem['rel'] == 'stylesheet/less'):
             filename = self.get_filename(elem['href'])
             path, ext = os.path.splitext(filename)
             if ext in settings.COMPILER_FORMATS.keys():
                 if self.recompile(filename):
                     self.compile(path, settings.COMPILER_FORMATS[ext])
                 basename = os.path.splitext(os.path.basename(filename))[0]
                 elem = BeautifulSoup(
                     re.sub(basename + ext, basename + '.css',
                            unicode(elem)))
                 filename = path + '.css'
             try:
                 self.split_content.append(('file', filename, elem))
             except UncompressableFileError:
                 if django_settings.DEBUG:
                     raise
         if elem.name == 'style':
             data = elem.string
             elem_type = elem.get('type', '').lower()
             if elem_type and elem_type != "text/css":
                 # it has to be preprocessed
                 if '/' in elem_type:
                     # we accept 'text/ccss' and plain 'ccss' too
                     elem_type = elem_type.split('/')[1]
                 # TODO: that dot-adding compatibility stuff looks strange.
                 # do we really need a dot in COMPILER_FORMATS keys?
                 ext = '.' + elem_type
                 data = self.compile_inline(data, ext)
                 elem = ''.join(
                     ("<style type='text/css'>\n", data, "\n</style>"))
             self.split_content.append(('hunk', data, elem))
     return self.split_content
Example #9
 def extract_forms(self, url):
     response = self.session.get(url)
     parsed_html = BeautifulSoup(response.content)
     # .get() reads a tag attribute; use findAll() to collect the <form> tags
     return parsed_html.findAll("form")
Example #10
def loopLinks(siteURL):
    #were links loaded from this site? if not - load them

    siteURL.replace("\r","")
    siteURL.replace("\n","")
    siteURL.strip()
    
    rs = scraperwiki.sqlite.select("count(*) as freq from swdata where src=?", siteURL)

    for d in rs:

        if d["freq"]==0:
            req = Request(siteURL)
            try:
                soup = BeautifulSoup(urlopen(req))    
            except HTTPError, e:
                print 'The server could not fulfill the request.'
                print 'Error code: ', e.code
            except URLError, e:
                print 'We failed to reach a server.'
                print 'Reason: ', e.reason
            else:
                aDomain = siteURL.replace("http://","")
                aDomain = aDomain.replace("https://","")
                aDomain = aDomain[:aDomain.find('/')]
    
                for tag in soup.findAll('a'):
    
                    linkURL = str(tag.get('href'))
                    addLink = "true"
    
                    #get absolute link
                    if linkURL.find('/')==0:
                       linkURL = 'http://' + aDomain + linkURL
                    else:
                        if linkURL.find('http://' + aDomain + '/')!= 0:
                            addLink = "false"
                    #ensure this is not the site url / home link
    
                    if linkURL == siteURL: addLink = "false"
    
                    if linkURL == 'http://' + origDomain + '/': addLink = "false"
                    linkURL.replace("\r","")
                    linkURL.replace("\n","")
                    linkURL.strip()
                    #ensure link has not already been loaded
                    rs = scraperwiki.sqlite.select("count(*) as freq from swdata where href=?", linkURL)
                    for d in rs:
                        if d["freq"]>0: addLink = "false"
    
                    if addLink!="false":
                        print linkURL
                        if str(linkURL).find("http://mesfilmsbelgacomtv.skynet.be/fr/films/")==0:
                            print "Film: " + linkURL
                            #get title, year, genre
                            filmReq = Request(linkURL)
                            try:
                                details = BeautifulSoup(urlopen(filmReq))    
                            except HTTPError, e:
                                print 'The server could not fulfill the request.'
                                print 'Error code: ', e.code
                            except URLError, e:
                                print 'We failed to reach a server.'
                                print 'Reason: ', e.reason
                            else:
        
                                fYear = str("unknown")
                                print fYear
                                
                                fGenre = str("unknown")
                                print fGenre
        
                                for yr in details.findAll(attrs={'class': 'year'}):
                                    if fYear == "unknown": 
                                        fYear = str(tag.get('class'))
            
                                print str(fYear)
        
                                for gNome in details.findAll(attrs={'id': 'wrapperGenome'}):
                                    for ul in gNome.findAll('ul'):
                                        fndGenre = str("false")
                                        for li in ul.findAll('li'):
                                            if tag.__class__ == NavigableString:            
                                                if fndGenre=="true":
                                                    fGenre = str(fGenre).replace("unknown","") + tag
                                                if tag == "Genres:": 
                                                    fndGenre = str("true")
        
                                print str(tag.get('title')).replace(" - Les films de Belgacom TV","")
                                                
                                data = { 
                                    'page' : details.html.head.title.contents,
                                    'domain' : aDomain,
                                    'src' : siteURL,
                                    'text' : details.NavigableString,
                                    'title' : details.get('title'),
                                    'class' : details.get('class'),
                                    'id' : details.get('id'),
                                    'href' : linkURL,
                                    'film' : str(details.get('title')).replace(" - Les films de Belgacom TV",""),
                                    'genre' : '', 
                                    'price' : '',
                                    'priceSD' : '',
                                    'priceHD' : '',
                                    'releaseYear' : str(fYear),
                                    'origCountry' : '',
                                    'rating' : '',
                                    'actors' : '',
                                    'actresses' : ''
                                }
                                scraperwiki.sqlite.save(unique_keys=['href'], data=data)
        
                        else:
                            print "Not a film: " + linkURL
                            data = {
                                'page' : soup.html.head.title.contents,
                                'domain' : aDomain,
                                'src' : siteURL,
                                'text' : tag.NavigableString,
                                'title' : tag.get('title'),
                                'class' : tag.get('class'),
                                'id' : tag.get('id'),
                                'href' : linkURL,
                                'film' : '',
                                'genre' : '',
                                'price' : '',
                                'priceSD' : '',
                                'priceHD' : '',
                                'releaseYear' : '',
                                'origCountry' : '',
                                'rating' : '',
                                'actors' : '',
                                'actresses' : ''
                            }
                            scraperwiki.sqlite.save(unique_keys=['href'], data=data)
                            if str(linkURL).find("http://mesfilmsbelgacomtv.skynet.be/fr/similar/")!=0:
                                if linkURL!="http://mesfilmsbelgacomtv.skynet.be/fr/":
                                    loopLinks(linkURL)
Example #11
import requests
from BeautifulSoup import BeautifulSoup as BS

resp = requests.get('http://example.com')
form = BS(resp.text).find('form')

# send POST request to "Accept" URL
data = {input.get('name'):input.get('value') for input in form.findAll('input') if input.get('name') is not None}
action = form.get('action')
resp = requests.post("http:"+action, data=data)

print("Success." if resp.status_code==200 else "Failure: POST to {} returned {}".format(action, str(resp)))

Example #12
def get_baseurl(html):
    "get the base url"
    constrained = SoupStrainer(["BASE", "base"])
    base = BeautifulSoup(html, parseOnlyThese=constrained).find(["BASE", "base"])
    return base.get("href") or base.get("HREF")