Python BeautifulSoup Examples, BeautifulSoup.BeautifulSoup.BeautifulSoup Python Examples

Example #1

0

Show file

File: rbs_scraper.py Project: JamieMcNaught/Bank-Scraper

    def doStep6(self, page):
        """Prepare the POST that asks for one month of transactions.

        Looks up the form named ``aspnetForm`` in ``page``, collects its
        fields with ``self.parseForm`` and stores the follow-up request in
        ``self.response`` (url and data passed through ``self.ByteToHex``,
        method ``POST``, step 20).

        Returns ``'bank error'`` when the form is not found, otherwise
        ``'good'``.
        """
        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        self.output_page("RBS-acclink.html", page)

        loginform = soup.find('form', attrs={'name': 'aspnetForm'})

        # Identity test rather than "== None": a comparison with the None
        # singleton should not be routed through the tag's own __eq__.
        if loginform is None:
            logging.debug('RBS no view account form')
            return 'bank error'

        action = self.urlBase + '/' + loginform['action']

        values = self.parseForm(loginform)

        # fill in our selection - 1 month
        values['ctl00$mainContent$SS2SPDDA'] = 'M1'

        # default button - needed
        values['ctl00$mainContent$NextButton_button'] = 'View Transactions'

        # build the body content
        data = urllib.urlencode(values)
        self.response = {}
        self.response['url'] = self.ByteToHex(action)
        self.response['data'] = self.ByteToHex(data)
        self.response['method'] = 'POST'
        self.response['step'] = 20

        return 'good'

Example #2

0

Show file

File: api.py Project: saketkc/Google-AppEngine-Projects

    def get(self):
        """Fetch a Cleartrip results page and write its fares as table rows.

        Reads the ``date``, ``from`` and ``to`` request parameters, fetches
        the results URL, takes the ``<script id="json">`` element and splits
        its text on ``;``.  Every fragment from index 2 up to (but excluding)
        the last nine is split on ``=``; the right-hand side is evaluated and
        its ``pr`` and ``legs`` entries are written to the response.
        A fragment that fails at any point is logged and skipped; rows already
        written for it stay in the output.
        """
        # Function-scope import: the file's own import block is not visible here.
        import logging

        dates = self.request.get('date')
        froms = self.request.get("from")
        to = self.request.get("to")
        # NOTE(review): the three request values go into the URL unescaped --
        # confirm that callers only ever pass plain airport codes and dates.
        resp = urlfetch.fetch("http://www.cleartrip.com/flights/results?from="+froms+"&to="+to+"&depart_date="+dates+"&adults=1&childs=0&infants=0&dep_time=0&class=Economy&airline=&carrier=&x=57&y=16&flexi_search=no&tb=n")
        soup = BS(resp.content)
        my_content = soup.find("script", {"id": "json"})
        string_resp = str(my_content).strip()
        resp_splitted = string_resp.split(';')
        self.response.headers['Content-Type'] = 'text/html'
        self.response.out.write('<html><body><table>')

        # Same window the original walked with range(2 - len(resp_splitted), -9):
        # indices 2 .. len - 10, empty when there are fewer than 12 fragments.
        for fragment in resp_splitted[2:-9]:
            query2 = fragment.strip().split('=')

            try:
                # SECURITY: eval() executes text downloaded from a remote site.
                # Left in place because the payload format is not visible here;
                # replace with a real parser once the format is confirmed.
                content = eval(query2[1])
                self.response.out.write("<tr><td>******************</td></tr>")
                self.response.out.write('<tr><td>Price</td>')
                self.response.out.write('<td>'+str(content.get('pr'))+'</td></tr>')
                legs = content.get('legs')
                for i, leg in enumerate(legs, 1):
                    self.response.out.write('<tr><td>Way '+str(i)+':</td>')
                    self.response.out.write('<td>'+leg.get('fr') + " => "+ leg['to'] +'</td></tr>')
                    self.response.out.write('<tr><td>Arrival '+str(i)+':</td>')
                    self.response.out.write('<td>'+str(leg.get('a'))+'</td></tr>')
                    self.response.out.write('<tr><td>Departure '+str(i)+':</td>')
                    self.response.out.write('<td>'+str(leg.get('dp'))+'</td></tr>')
            except Exception as exc:
                # Best effort, as before: fragments that are not "name=value"
                # fare objects are skipped.  Narrowed from a bare "except:" so
                # SystemExit/KeyboardInterrupt are no longer swallowed.
                logging.debug('skipping unparsable fragment: %s', exc)

Example #3

0

Show file

File: rbs_scraper.py Project: JamieMcNaught/Bank-Scraper

    def doAllLink(self, page):
        """Look for the "All" link of the statement list in ``page``.

        Scans every anchor.  The first one whose ``href`` matches
        ``.tatements.ixed.eriod`` and whose text matches ``.ll`` is passed to
        ``self.composeLink`` and the result kept in the local ``link``.
        Anchors that raise while being inspected (for example because they
        have no ``href``) are logged and skipped.
        """
        soup = BeautifulSoup(page)

        self.output_page("RBS-xactlist-all-look.html", page)

        # Shape of the anchor we are after (query string shortened):
        #<a href="/StatementsFixedPeriod.aspx?id=...&amp;persist=...&amp;showall=1" title="Show all items on a single page">All</a>

        logging.debug('RBS checking for all links')

        # find any all link
        links = soup.findAll('a')

        link = None
        for a in links:
            # detect our link
            try:
                if re.search(".tatements.ixed.eriod", a['href']):
                    logging.debug("RBS - got a statement  link")
                    if re.search(".ll", a.text):  # the one that says all
                        link = self.composeLink(a['href'][:])
                        logging.debug("RBS - got an All statement link")
                        break  # only need the first one so break the for loop
            except Exception as e:
                # "as" form is valid on Python 2.6+ and Python 3 alike
                logging.debug('RBS a link error missing href - ' + str(e))

Example #4

0

Show file

    def doStep1(self, allofit, page):
        """Find the security frame on the start page and queue a GET for it.

        ``page`` is searched for ``<frame id="ctl00_secframe">``.  When it is
        present, ``self.response`` is set to a GET of ``self.urlBase`` plus
        the frame's ``src`` with step 2 and ``'good'`` is returned; otherwise
        ``'bank error'`` is returned.  ``allofit`` is not read by the active
        code.
        """
        body = page

        scrape_result = 'good'
        logging.info("NatWest page1")

        # the following is how you could retrieve the headers from the request
        # for head in allofit['headers']:
        #     name = self.HexToByte(head['name'])
        #     val = self.HexToByte(head['value'])

        # write out the start page
        self.output_page("1_first_page.html", body)

        soup = BeautifulSoup(body)

        frame = soup.find('frame', id='ctl00_secframe')

        # identity test rather than "!= None"
        if frame is not None:

            action = self.urlBase + '/' + frame['src']

            #<frame id="ctl00_secframe" title="Security Frame" frameborder="0" src="login.aspx?refererident=774C53DCE4C17556595C91973A6DF1A0A1F6242E&amp;cookieid=100714"></frame>

            self.response = {}
            self.response['url'] = self.ByteToHex(action)
            self.response['data'] = ""
            self.response['method'] = 'GET'
            self.response['step'] = 2
        else:
            logging.debug('NatWest frame link error - ')
            scrape_result = 'bank error'

        return scrape_result

Example #5

0

Show file

File: natwest_scraper.py Project: JamieMcNaught/Bank-Scraper

    def doStep1(self, allofit, page):
        """Queue a GET for the security frame found on the start page.

        Returns ``"bank error"`` when ``page`` has no
        ``<frame id="ctl00_secframe">``.  Otherwise ``self.response`` is set
        to a GET (step 2) of ``self.urlBase`` plus the frame's ``src`` and
        ``"good"`` is returned.  ``allofit`` is not read by the active code.
        """
        body = page

        logging.info("NatWest page1")

        # the following is how you could retrieve the headers from the request
        # for head in allofit['headers']:
        #     name = self.HexToByte(head['name'])
        #     val = self.HexToByte(head['value'])

        # write out the start page
        self.output_page("1_first_page.html", body)

        soup = BeautifulSoup(body)

        frame = soup.find("frame", id="ctl00_secframe")

        # Guard clause with an identity test instead of "!= None".
        if frame is None:
            logging.debug("NatWest frame link error - ")
            return "bank error"

        action = self.urlBase + "/" + frame["src"]

        # <frame id="ctl00_secframe" title="Security Frame" frameborder="0" src="login.aspx?refererident=774C53DCE4C17556595C91973A6DF1A0A1F6242E&amp;cookieid=100714"></frame>

        self.response = {}
        self.response["url"] = self.ByteToHex(action)
        self.response["data"] = ""
        self.response["method"] = "GET"
        self.response["step"] = 2

        return "good"

Example #6

0

Show file

File: natwest_scraper.py Project: JamieMcNaught/Bank-Scraper

    def doStep7(self, page):
        """Prepare the POST that asks for one month of transactions.

        Collects the fields of the ``aspnetForm`` form in ``page`` with
        ``self.parseForm``, sets the period and button fields, and stores the
        request in ``self.response`` (method ``POST``, step 20).

        Returns ``"bank error"`` when the form is missing, else ``"good"``.
        """
        # -------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        self.output_page("natwest-acclink.html", page)

        loginform = soup.find("form", attrs={"name": "aspnetForm"})

        # identity test rather than "== None"
        if loginform is None:
            logging.debug("NatWest no view account form")
            return "bank error"

        action = self.urlBase + "/" + loginform["action"]

        values = self.parseForm(loginform)

        # fill in our selection - 1 month
        values["ctl00$mainContent$SS2SPDDA"] = "M1"

        # default button - needed
        values["ctl00$mainContent$NextButton_button"] = "View Transactions"

        # build the body content
        data = urllib.urlencode(values)
        self.response = {}
        self.response["url"] = self.ByteToHex(action)
        self.response["data"] = self.ByteToHex(data)
        self.response["method"] = "POST"
        self.response["step"] = 20

        return "good"

Example #7

0

Show file

File: natwest_scraper.py Project: JamieMcNaught/Bank-Scraper

    def doStep2(self, allofit, page):
        """Prepare the POST that submits the customer number.

        Collects the fields of the ``aspnetForm`` form in ``page`` with
        ``self.parseForm``, fills the customer-number field from
        ``self.filledCreds["01"]`` and stores the request in
        ``self.response`` (method ``POST``, step 3).  ``allofit`` is not read.

        Returns ``"bank error"`` when the form is missing, else ``"good"``.
        """
        # -------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        self.output_page("natwest-username.html", page)

        loginform = soup.find("form", attrs={"name": "aspnetForm"})

        # identity test rather than "== None"
        if loginform is None:
            logging.debug("NatWest no login form")
            return "bank error"

        action = self.urlBase + "/" + loginform["action"]

        values = self.parseForm(loginform)

        # fill in our credentials
        values["ctl00$mainContent$LI5TABA$DBID_edit"] = self.filledCreds["01"]  # customer number

        # build the body content
        data = urllib.urlencode(values)
        self.response = {}
        self.response["url"] = self.ByteToHex(action)
        self.response["data"] = self.ByteToHex(data)
        self.response["method"] = "POST"
        self.response["step"] = 3

        return "good"

Example #8

0

Show file

File: first_direct_scraper.py Project: JamieMcNaught/Bank-Scraper

    def firstPass(self, page):
        """Prepare the POST that submits the user id to the first form.

        Takes the first ``<form>`` in ``page``, derives ``self.urlBase`` from
        the host part of its ``action`` and stores a POST (step 2) to that
        action in ``self.response``, carrying ``self.filledCreds['03']`` as
        ``userid``.
        """
        soup = BeautifulSoup(page)
        loginform = soup.find('form')
        action = loginform['action']

        # base url = https scheme + host of the form target
        urls = urlparse(action)
        self.urlBase = "https://" + urls.netloc
        logging.info("Base URL = " + self.urlBase)

        # not used by the visible code; kept so the lookup still happens
        inputs = loginform.findAllNext('input')

        values = {'userid': self.filledCreds['03']}  # username

        # build the body content
        data = urllib.urlencode(values)

        reply = self.response = {}
        reply['url'] = self.ByteToHex(action)
        reply['data'] = self.ByteToHex(data)
        reply['method'] = 'POST'
        reply['step'] = 2

Example #9

0

Show file

File: seasonList.py Project: Armedite/xbmc-catchuptv-au

 def __init__( self ):
     """List the "Full Episodes" seasons of a show as XBMC directory items.

     Fetches ``BASE_URL`` plus the unquoted ``showUrl`` plugin parameter,
     works out the show title (from a header ``div``'s ``h1`` or, failing
     that, the page's ``siteName`` script variable) and adds one folder
     item per ``li`` of the list that follows the "Full Episodes" heading.
     """
     params = self._parse_argv()
     page_url = self.BASE_URL + unquote_plus(params['showUrl'])
     source = self._fetch_url(page_url)
     # print(...) with one argument prints the same on Python 2 and 3
     print('Loading ' + page_url)
     seasonIndex = BeautifulSoup(source)
     tvshowcontainer = seasonIndex.find('div',id=re.compile('scet_header|scet_top|show-header|show-header-scet|^header$'))
     # identity tests rather than "==None" / "!=None"
     if tvshowcontainer is None:
         tvshowcontainer = seasonIndex.find('div',{'class':re.compile('scet_header|scet_top|show-header|show-header-scet')})
     if tvshowcontainer is not None:
         tvshowtitle = tvshowcontainer.find('h1').string
     else:
         tvshowtitle = re.search('var siteName = "(.+?)";',source).group(1)
     print('Parsing seasons for "%s"' % tvshowtitle)
     showsListing = seasonIndex.find('div',{"class":re.compile('scet-gallery-nav')}).find('h3',text='Full Episodes').parent.findNextSibling('ul').findAll('li')
     # the plugin handle does not change, so convert it once
     handle = int(sys.argv[1])
     for show in showsListing:
         showLink = show.find('a')
         print('Found ' + showLink.string)
         listitem = xbmcgui.ListItem(decode_htmlentities(showLink.string))
         listitem.setInfo('video',{'tvshowtitle':tvshowtitle})
         #listitem.setThumbnailImage(showLink.find('img')['src'])
         # make the link relative to BASE_URL
         if showLink['href'][0] == '/':
             showUrl = showLink['href'][1:]
         else:
             showUrl = showLink['href'].replace(self.BASE_URL,'')
         xbmcplugin.addDirectoryItem(handle=handle,listitem=listitem,url="%s?seasonUrl=%s" % (sys.argv[0], quote_plus(showUrl)),totalItems=len(showsListing),isFolder=True)
     xbmcplugin.setContent(handle=handle, content='seasons')
     xbmcplugin.endOfDirectory(handle=handle, succeeded=1)

Example #10

0

Show file

    def DoStep3(self, allofit):
        """Collect the account links from the summary page.

        ``allofit['body']`` is converted with ``self.HexToByte`` and searched
        for the ``fdBalancesTable`` table.  Its ``fdActionLink`` anchors are
        stored in ``self.accountLinks``.

        Returns ``'credentials incorrect'`` when the table is missing,
        ``'bank error'`` when it holds no links, otherwise ``"good"``.
        """
        scrape_result = "good"

        page = self.HexToByte(allofit['body'])

        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        self.output_page("fd-summary.html", page)

        accountTable = soup.find('table', attrs={'class': 'fdBalancesTable'})

        # identity test rather than "!= None"
        if accountTable is not None:
            self.accountLinks = accountTable.findAll(
                'a', attrs={'class': 'fdActionLink'})

            # empty result list: no accounts
            if not self.accountLinks:
                #got some kind of message
                scrape_result = 'bank error'
                logging.info('Still got no accounts')
        else:
            logging.debug("No fd table")
            scrape_result = 'credentials incorrect'

        return scrape_result

Example #11

0

Show file

    def doStep7(self, page):
        """Build the "view transactions" POST for a one-month period.

        The fields of the ``aspnetForm`` form in ``page`` are collected with
        ``self.parseForm``; the period and button fields are then set and
        the request is stored in ``self.response`` (method ``POST``, step 20).

        Returns ``'bank error'`` if the form is not found, else ``'good'``.
        """
        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        self.output_page("natwest-acclink.html", page)

        loginform = soup.find('form', attrs={'name': 'aspnetForm'})

        # "is None" instead of "== None": identity is the right test for None
        if loginform is None:
            logging.debug('NatWest no view account form')
            return 'bank error'

        action = self.urlBase + '/' + loginform['action']

        values = self.parseForm(loginform)

        # fill in our selection - 1 month
        values['ctl00$mainContent$SS2SPDDA'] = 'M1'

        # default button - needed
        values['ctl00$mainContent$NextButton_button'] = 'View Transactions'

        # build the body content
        data = urllib.urlencode(values)
        self.response = {}
        self.response['url'] = self.ByteToHex(action)
        self.response['data'] = self.ByteToHex(data)
        self.response['method'] = 'POST'
        self.response['step'] = 20

        return 'good'

Example #12

0

Show file

    def doStep2(self, allofit, page):
        """Build the POST that sends the customer number.

        The fields of the ``aspnetForm`` form in ``page`` are collected with
        ``self.parseForm``; the customer-number field is filled from
        ``self.filledCreds['01']`` and the request is stored in
        ``self.response`` (method ``POST``, step 3).  ``allofit`` is not read.

        Returns ``'bank error'`` if the form is not found, else ``'good'``.
        """
        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        self.output_page("natwest-username.html", page)

        loginform = soup.find('form', attrs={'name': 'aspnetForm'})

        # "is None" instead of "== None": identity is the right test for None
        if loginform is None:
            logging.debug('NatWest no login form')
            return 'bank error'

        action = self.urlBase + '/' + loginform['action']

        values = self.parseForm(loginform)

        # fill in our credentials
        customer_number = self.filledCreds['01']
        values["ctl00$mainContent$LI5TABA$DBID_edit"] = customer_number

        # build the body content
        data = urllib.urlencode(values)
        self.response = {}
        self.response['url'] = self.ByteToHex(action)
        self.response['data'] = self.ByteToHex(data)
        self.response['method'] = 'POST'
        self.response['step'] = 3

        return 'good'

Example #13

0

Show file

    def doAllLink(self, page):
        """Look for the "All" link of the statement list in ``page``.

        Every anchor is inspected in turn.  The first whose ``href`` matches
        ``.tatements.ixed.eriod`` and whose text matches ``.ll`` is handed to
        ``self.composeLink``; the result is kept in the local ``link``.
        An anchor that raises during inspection (for example one without an
        ``href``) is logged and skipped.
        """
        soup = BeautifulSoup(page)

        self.output_page("natwest-xactlist-all-look.html", page)

        # Shape of the anchor we are after (query string shortened):
        #<a href="/StatementsFixedPeriod.aspx?id=...&amp;persist=...&amp;showall=1" title="Show all items on a single page">All</a>

        logging.debug('NatWest checking for all links')

        # find any all link
        links = soup.findAll('a')

        link = None
        for a in links:
            # detect our link
            try:
                if re.search(".tatements.ixed.eriod", a['href']):
                    logging.debug("natwest - got a statement  link")
                    if re.search(".ll", a.text):  # the one that says all
                        link = self.composeLink(a['href'][:])
                        logging.debug("natwest - got an All statement link")
                        break  # only need the first one so break the for loop
            except Exception as e:
                # "as" form is valid on Python 2.6+ and Python 3 alike
                logging.debug('NatWest a link error missing href - ' + str(e))

Example #14

0

Show file

File: rbs_scraper.py Project: JamieMcNaught/Bank-Scraper

    def doStep2(self, allofit, page):
        """Prepare the POST that submits the customer number.

        Collects the fields of the ``aspnetForm`` form in ``page`` with
        ``self.parseForm``, fills the customer-number field from
        ``self.filledCreds['01']`` and stores the request in
        ``self.response`` (method ``POST``, step 3).  ``allofit`` is not read.

        Returns ``'bank error'`` when the form is missing, else ``'good'``.
        """
        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        self.output_page("RBS-username.html", page)

        loginform = soup.find('form', attrs={'name': 'aspnetForm'})

        # identity test rather than "== None"
        if loginform is None:
            logging.debug('RBS no login form')
            return 'bank error'

        action = self.urlBase + '/' + loginform['action']

        values = self.parseForm(loginform)

        # fill in our credentials
        values["ctl00$mainContent$LI5TABA$DBID_edit"] = self.filledCreds['01']   #customer number

        # build the body content
        data = urllib.urlencode(values)
        self.response = {}
        self.response['url'] = self.ByteToHex(action)
        self.response['data'] = self.ByteToHex(data)
        self.response['method'] = 'POST'
        self.response['step'] = 3

        return 'good'

Example #15

0

Show file

File: episodeList.py Project: Armedite/xbmc-catchuptv-au

 def __init__( self ):
     """List every full episode of a show as XBMC directory items.

     Fetches ``BASE_URL`` plus the unquoted ``showUrl`` plugin parameter,
     then walks each ``dashPageHolder`` div whose id starts with ``fullEp``
     and adds one item per ``episodeListing`` div.  Air date, plot, season,
     episode number and duration are pulled out of the episode markup with
     regular expressions.

     NOTE(review): an episode whose markup does not match one of the
     patterns raises AttributeError on ``.group`` -- unchanged from before.
     """
     params = self._parse_argv()
     source = self._fetch_url(self.BASE_URL + unquote_plus(params['showUrl']))
     showIndex = BeautifulSoup(source)
     vidInfo = {'tvshowtitle': showIndex.find('div',id='showDashboard').find('span',{'class':'blueText'}).string, 'studio': 'FOX'}
     seasonsListing = showIndex.findAll('div',{'class':re.compile('dashPageHolder'),'id':re.compile('^fullEp')})
     # print(...) with one argument prints the same on Python 2 and 3
     print(len(seasonsListing))
     # Raw strings: "\s", "\(" and "\)" are regex escapes, not string escapes.
     # Compiled once here instead of being rebuilt for every episode.
     aired_re = re.compile(r'Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$')
     season_re = re.compile(r'Season\s+([0-9]+?)[\s:]')
     episode_re = re.compile(r'Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)')
     for season in seasonsListing:
         episodesListing = season.findAll('div',{'class':'episodeListing'})
         for episode in episodesListing:
             # the title feeds both the list item and the info dict
             title = episode.find('h3').find('a').string
             listitem = xbmcgui.ListItem(title)
             listitem.setThumbnailImage(episode.find('img',id=re.compile('^epThumb'))['src'])
             episodeLink = episode.find('a',{'class':'thumbnailLink'})
             # make the link relative to BASE_URL
             if episodeLink['href'][0] == '/':
                 episodeUrl = episodeLink['href'][1:]
             else:
                 episodeUrl = episodeLink['href'].replace(self.BASE_URL,'')
             airedDateAndPlot = aired_re.search(str(episode.find('div',{'class':'episodeInfo'})))
             seasonNum = season_re.search(str(episode.find('p',{'class':'seasonNum'})))
             episodeNumAndDuration = episode_re.search(str(episode.find('p',{'class':'episodeNumLine'})))
             # groups are (month, day, year); stored as year-month-day
             vidInfo['aired'] = '%s-%s-%s' % (airedDateAndPlot.group(3),airedDateAndPlot.group(1),airedDateAndPlot.group(2))
             vidInfo['season'] = int(seasonNum.group(1))
             vidInfo['episode'] = int(episodeNumAndDuration.group(1))
             vidInfo['duration'] = episodeNumAndDuration.group(2)
             vidInfo['title'] = title
             vidInfo['plot'] = decode_htmlentities(airedDateAndPlot.group(4))
             print(vidInfo)
             listitem.setInfo("video",vidInfo)
             xbmcplugin.addDirectoryItem(handle=int( sys.argv[ 1 ] ),listitem=listitem,url="%s?episodeUrl=%s" % ( sys.argv[ 0 ], quote_plus(episodeUrl)))
     xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ), succeeded=1 )

Example #16

0

Show file

    def firstPass(self, page):
        """Queue the POST that sends the user id to the page's first form.

        ``self.urlBase`` is set to ``https://`` plus the host of the form's
        ``action``; ``self.response`` becomes a POST (step 2) to that action
        with ``self.filledCreds['03']`` as the ``userid`` field.
        """
        loginform = BeautifulSoup(page).find('form')

        action = loginform['action']

        # host part of the form target becomes the base url
        urls = urlparse(action)
        self.urlBase = "https://" + urls.netloc
        logging.info("Base URL = " + self.urlBase)

        # not used by the visible code; kept so the lookup still happens
        inputs = loginform.findAllNext('input')

        # username is the only field sent
        values = {'userid': self.filledCreds['03']}

        # build the body content
        data = urllib.urlencode(values)

        pending = self.response = {}
        pending['url'] = self.ByteToHex(action)
        pending['data'] = self.ByteToHex(data)
        pending['method'] = 'POST'
        pending['step'] = 2

Example #17

0

Show file

File: first_direct_scraper.py Project: JamieMcNaught/Bank-Scraper

 def DoStep3(self, allofit):
     """Collect the account links from the summary page.

     ``allofit['body']`` is converted with ``self.HexToByte`` and searched
     for the ``fdBalancesTable`` table; its ``fdActionLink`` anchors are
     stored in ``self.accountLinks``.

     Returns ``'credentials incorrect'`` when the table is missing,
     ``'bank error'`` when it holds no links, otherwise ``"good"``.
     """
     page = self.HexToByte(allofit['body'])

     #-------------------------------- Grab the form values -----------------------------------------------
     soup = BeautifulSoup(page)

     self.output_page("fd-summary.html", page)

     accountTable = soup.find('table', attrs={'class': 'fdBalancesTable'})

     # Guard clause with an identity test instead of "!= None".
     if accountTable is None:
         logging.debug("No fd table")
         return 'credentials incorrect'

     self.accountLinks = accountTable.findAll('a', attrs={'class': 'fdActionLink'})

     # empty result list: no accounts
     if not self.accountLinks:
         #got some kind of message
         logging.info('Still got no accounts')
         return 'bank error'

     return "good"

Example #18

0

Show file

 def __init__(self):
     """Add one XBMC directory item per full episode of a show.

     The page at ``BASE_URL`` plus the unquoted ``showUrl`` plugin
     parameter is fetched; every ``episodeListing`` div inside a
     ``dashPageHolder`` div whose id starts with ``fullEp`` becomes an
     item.  Air date, plot, season, episode number and duration are
     extracted from the episode markup with regular expressions.

     NOTE(review): an episode whose markup does not match one of the
     patterns raises AttributeError on ``.group`` -- unchanged from before.
     """
     params = self._parse_argv()
     source = self._fetch_url(self.BASE_URL +
                              unquote_plus(params['showUrl']))
     showIndex = BeautifulSoup(source)
     show_title = showIndex.find('div', id='showDashboard').find(
         'span', {'class': 'blueText'}).string
     vidInfo = {'tvshowtitle': show_title, 'studio': 'FOX'}
     seasonsListing = showIndex.findAll(
         'div', {
             'class': re.compile('dashPageHolder'),
             'id': re.compile('^fullEp')
         })
     # print(...) with one argument prints the same on Python 2 and 3
     print(len(seasonsListing))

     # Raw strings keep "\s", "\(" and "\)" as regex escapes; the patterns
     # are loop-invariant, so they are built once up front.
     aired_pattern = re.compile(
         r'Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$')
     season_pattern = re.compile(r'Season\s+([0-9]+?)[\s:]')
     episode_pattern = re.compile(
         r'Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)')

     for season in seasonsListing:
         episodesListing = season.findAll('div',
                                          {'class': 'episodeListing'})
         for episode in episodesListing:
             # looked up once; used for the list item and the info dict
             episode_title = episode.find('h3').find('a').string
             listitem = xbmcgui.ListItem(episode_title)
             listitem.setThumbnailImage(
                 episode.find('img', id=re.compile('^epThumb'))['src'])
             episodeLink = episode.find('a', {'class': 'thumbnailLink'})
             # make the link relative to BASE_URL
             if episodeLink['href'][0] == '/':
                 episodeUrl = episodeLink['href'][1:]
             else:
                 episodeUrl = episodeLink['href'].replace(self.BASE_URL, '')
             airedDateAndPlot = aired_pattern.search(
                 str(episode.find('div', {'class': 'episodeInfo'})))
             seasonNum = season_pattern.search(
                 str(episode.find('p', {'class': 'seasonNum'})))
             episodeNumAndDuration = episode_pattern.search(
                 str(episode.find('p', {'class': 'episodeNumLine'})))
             # groups are (month, day, year); stored as year-month-day
             vidInfo['aired'] = '%s-%s-%s' % (airedDateAndPlot.group(3),
                                              airedDateAndPlot.group(1),
                                              airedDateAndPlot.group(2))
             vidInfo['season'] = int(seasonNum.group(1))
             vidInfo['episode'] = int(episodeNumAndDuration.group(1))
             vidInfo['duration'] = episodeNumAndDuration.group(2)
             vidInfo['title'] = episode_title
             vidInfo['plot'] = decode_htmlentities(
                 airedDateAndPlot.group(4))
             print(vidInfo)
             listitem.setInfo("video", vidInfo)
             xbmcplugin.addDirectoryItem(
                 handle=int(sys.argv[1]),
                 listitem=listitem,
                 url="%s?episodeUrl=%s" %
                 (sys.argv[0], quote_plus(episodeUrl)))
     xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)

Example #19

0

Show file

    def rewrite_html(self, guid, html=None, ajax_url=None):
        """Return html for ``guid`` with image sources pointed at the cache.

        If we are not using ajax, then html is IGNORED and we go by the
        cached copy.  html is sometimes used to see if there should be a
        cached copy at all, or if something goes wrong and we just need to
        return unaltered html.

        Outcomes, in order:
          * no mapping file: ``html`` is returned unchanged (after calling
            ``self.cache_html`` when it contains an ``<img`` tag);
          * the mapping file cannot be read: errors are logged and ``html``
            is returned unchanged;
          * ``ajax_url`` is None: the second object stored in the mapping
            file is returned;
          * otherwise ``html`` is re-parsed and every ``img`` whose ``src``
            is marked ``UrlCacher.DOWNLOADED`` in the mapping is rewritten to
            ``ajax_url + "/cache/" + <mapped name>``.
        """

        guid = str(guid)
        cache_dir = os.path.join(self._store_location, guid_hash(guid))
        mapping_file = os.path.join(cache_dir, guid + "-" + "mapping.pickle")

        if not os.path.isfile(mapping_file):
            # quick and dirty check.  are there images?  if not, plain
            # html is fine.  html defaults to None, so it is checked before
            # calling lower() on it.
            if html is not None and html.lower().find('<img') >= 0:
                #logging.warning("Should be downloaded images, but couldn't open mapping.  Recaching")
                self.cache_html(guid, html)
            return html

        try:
            mapping = open(mapping_file, 'r')
            try:
                rewrite_hash = pickle.load(mapping)
                non_ajax_html = pickle.load(mapping)
            finally:
                # release the handle even when unpickling fails
                mapping.close()
        except Exception:
            # narrowed from a bare "except:" so SystemExit and
            # KeyboardInterrupt still propagate
            logging.error("error opening cache pickle for guid %s %s" %
                          (guid, mapping_file))
            logging.error(
                "If you have upgraded penguintv, you might need to delete your image cache"
            )
            return html

        if ajax_url is None:
            return non_ajax_html

        #else, rewrite on the fly
        soup = BeautifulSoup(html)
        img_tags = soup.findAll('img')

        if len(img_tags) == 0:
            return html

        for result in img_tags:
            # believe it or not, some img tags don't have a src, they have an id
            # that points to CSS.  At least I think that's what's going on
            if not result.has_key('src'):
                continue
            src = result['src']
            if not rewrite_hash.has_key(src):
                continue
            if rewrite_hash[src][1] == UrlCacher.DOWNLOADED:
                #if os.path.isfile(os.path.join(self._store_location, rewrite_hash[result['src']][0])):
                result['src'] = ajax_url + "/cache/" + rewrite_hash[src][0]
                #else:
                #	logging.warning("file not found, not replacing")
                #	logging.debug("(should we attempt to recache here?")

        return soup.prettify()

Example #20

0

Show file

    def _parseNatWestLinks(self, raw):
        """Collect account links and account details from ``raw``.

        Every ``accountNameExpand`` anchor is appended to
        ``self.accountLinks``.  For each one the enclosing row (the anchor's
        grandparent) supplies the account number and sort code, which are
        combined as ``<sortcode>-<account number>``, and -- when the row has
        a fourth cell -- the balance.  The result is appended to
        ``self.myAccounts`` as a dict with ``name``, ``num``, ``type`` and
        ``bal`` keys.

        Returns ``'account problem'`` when there are no anchors and
        ``'bank error'`` when a row's number or sort code cannot be read.
        The visible code has no other return statement.
        """
        soup = BeautifulSoup(raw)
        accountBLock = soup.findAll('a', attrs={'class': 'accountNameExpand'})

        # got some acount details now so all good
        if not accountBLock:
            logging.warning('NatWest no accounts after continue form')
            return 'account problem'

        for ac_link in accountBLock:
            ac_link.string = ac_link.text
            self.accountLinks.append(ac_link)

            # now the accnum list - to get the pair data, cos cant get it from link
            row = ac_link.parent.parent
            try:
                # find the account number span
                acnumSpan = row.find('span', attrs={'class': 'AccountNumber'})
                acnum = acnumSpan.text
                acnum = acnum.replace(' ', '')

                # find the sort code span
                sortSpan = row.find('span', attrs={'class': 'SortCode'})
                sortc = sortSpan.text
                sortc = sortc.replace(' ', '')
                sortc = sortc.replace('-', '')
            except Exception as e:
                # "as" form is valid on Python 2.6+ and Python 3 alike
                logging.exception('NatWest form error - ' + str(e))
                return 'bank error'

            #combine the two - to be our matching number
            num = sortc + "-" + acnum

            actype = 'Cheque'
            # might be a credit card
            if len(acnum) > 14:
                actype = 'Credit'

            # now get balances...
            balance = 0
            # same row as above; the balance is read from its fourth cell
            baltr = row
            baltds = baltr.findAll('td')
            # index 3 needs at least four cells; the former "> 2" test let a
            # three-cell row through and raised IndexError
            if len(baltds) > 3:
                baltext = self.tidy_text(baltds[3].text)
                balance = self.normalise_ammount(baltext)

            # and add it to our account list
            acpair = {
                'name': ac_link.text,
                'num': num,
                'type': actype,
                'bal': balance
            }

            self.myAccounts.append(acpair)

Example #21

0

Show file

    def doStep3(self, allofit, page):
        """Start handling the security (PIN / password) page.

        Returns ``'credentials incorrect'`` when ``page`` still contains the
        customer-number input, and ``'bank error'`` when it has no
        ``aspnetForm`` form.  Otherwise the form fields are collected with
        ``self.parseForm`` and the first challenge label is looked up;
        ``useNewTab`` is set when that label cannot be read.  ``allofit`` is
        not read by the visible code.
        """
        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        # write out the start page
        self.output_page("natwest-security.html", page)

        # (was assigned twice with the same value)
        scrape_result = 'good'
        logging.info("NatWest security page2")

        # check if we got returned
        # check for the password input ctl00$mainContent$LI5TABA$DBID_edit then we didnt move on
        errorDiv = soup.findAll(
            'input', attrs={'name': 'ctl00$mainContent$LI5TABA$DBID_edit'})

        if errorDiv:
            logging.info("NatWest security page1 still - customer number bad")
            return 'credentials incorrect'  # if we get here then the form was found hence creds must be wrong

        # find our form
        loginform = soup.find('form', attrs={'name': 'aspnetForm'})

        # identity test rather than "== None"
        if loginform is None:
            logging.debug('NatWest no security form')
            return 'bank error'

        values = self.parseForm(loginform)

        # define some variables that would only otherwise exist in a try catch block scope
        # the label text split on spaces
        which1arr = ""
        which2arr = ""
        which3arr = ""

        # the chalenges
        firstDigit = ""
        secondDigit = ""
        thirdDigit = ""

        #>>>>>>> The first set of Pin fields
        #-------------------- get the questions --------------#

        #<label for="ctl00_mainContent_Tab1_LI6PPEA_edit" id="ctl00_mainContent_Tab1_LI6DDALALabel" class="wizCrl wizardLabelRememberMeWide">Enter the 2nd number</label>
        useNewTab = False
        try:
            which1 = soup.find('label',
                               attrs={
                                   'for': 'ctl00_mainContent_LI6PPEA_edit'
                               }).text
        except Exception as e:
            # label not found under the old id ("as" form is valid on
            # Python 2.6+ and Python 3 alike)
            useNewTab = True

Example #22

0

Show file

File: showList.py Project: Armedite/xbmc-catchuptv-au

 def __init__( self ):
     """Add one XBMC folder item per show listed on the index page.

     The page at ``BASE_FOD_URL`` is fetched and every ``li`` inside the
     ``episodes-listing`` div becomes a folder item whose url carries the
     show link, made relative to ``BASE_URL``, as ``showUrl``.
     """
     index_page = BeautifulSoup(self._fetch_url(self.BASE_FOD_URL))
     entries = index_page.find('div', id='episodes-listing').findAll('li')
     for entry in entries:
         anchor = entry.find('a')
         # drop a leading slash, otherwise strip the site prefix
         if anchor['href'][0] != '/':
             relative = anchor['href'].replace(self.BASE_URL, '')
         else:
             relative = anchor['href'][1:]
         # evaluated in the same order as the original call arguments
         handle = int(sys.argv[1])
         item = xbmcgui.ListItem(anchor.string)
         target = "%s?showUrl=%s" % (sys.argv[0], quote_plus(relative))
         xbmcplugin.addDirectoryItem(handle=handle, listitem=item, url=target,
                                     totalItems=len(entries), isFolder=True)
     xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)

Example #23

0

Show file

File: OfflineImageCache.py Project: ywwg/penguintv

	def rewrite_html(self, guid, html=None, ajax_url=None):
		"""Return the HTML to display for entry *guid*, using cached images.

		Without ``ajax_url`` the ``html`` argument is ignored for rendering:
		the pre-rewritten copy stored in the mapping pickle is returned.
		With ``ajax_url`` every ``<img src>`` in ``html`` that the mapping
		marks as ``UrlCacher.DOWNLOADED`` is rewritten on the fly to
		``<ajax_url>/cache/<cached file>``.

		``html`` is returned unaltered when there is no mapping pickle (after
		asking ``cache_html`` to recache if it contains ``<img``), when the
		pickle cannot be read, or when it contains no ``<img>`` tags.
		"""

		guid = str(guid)
		cache_dir = os.path.join(self._store_location, guid_hash(guid))
		mapping_file = os.path.join(cache_dir, guid + "-" + "mapping.pickle")

		if not os.path.isfile(mapping_file):
			# quick and dirty check.  are there images?  if not, plain
			# html is fine.  html defaults to None, so guard before
			# calling string methods on it.
			if html is not None and html.lower().find('<img') >= 0:
				self.cache_html(guid, html)
			return html

		try:
			# NOTE(review): pickle is only safe because the cache file is
			# presumably written by this application -- confirm.
			with open(mapping_file, 'r') as mapping:
				rewrite_hash = pickle.load(mapping)
				non_ajax_html = pickle.load(mapping)
		except Exception:
			logging.error("error opening cache pickle for guid %s %s" % (guid, mapping_file))
			logging.error("If you have upgraded penguintv, you might need to delete your image cache")
			return html

		if ajax_url is None:
			return non_ajax_html

		#else, rewrite on the fly
		soup = BeautifulSoup(html)
		img_tags = soup.findAll('img')

		if not img_tags:
			return html

		for result in img_tags:
			# believe it or not, some img tags don't have a src, they have an id
			# that points to CSS.  At least I think that's what's going on
			if not result.has_key('src'):
				continue
			src = result['src']
			# Only point at the cache for images that were actually downloaded.
			if src in rewrite_hash and rewrite_hash[src][1] == UrlCacher.DOWNLOADED:
				result['src'] = ajax_url + "/cache/" + rewrite_hash[src][0]

		return soup.prettify()

Example #24

0

Show file

File: rbs_scraper.py Project: JamieMcNaught/Bank-Scraper

    def _parseRBSLinks(self, raw):
        """Collect the account links and account summaries from an RBS page.

        Every ``<a class="accountNameExpand">`` is appended to
        ``self.accountLinks``; for each one a dict with ``name``, ``num``
        (sort code + "-" + account number), ``type`` and ``bal`` is appended
        to ``self.myAccounts``.

        Returns 'account problem' when the page has no account links and
        'bank error' when a row lacks the account-number or sort-code span.
        Falls off the end (returns None) when every account was parsed.
        """
        soup = BeautifulSoup(raw)
        accountBLock = soup.findAll('a', attrs={'class': 'accountNameExpand'})

        # no account links at all means the continue form did not work
        if len(accountBLock) == 0:
            logging.warning('RBS no accounts after continue form')
            return 'account problem'

        for ac_link in accountBLock:
            ac_link.string = ac_link.text
            self.accountLinks.append(ac_link)

            # the account number and sort code live in the row that holds
            # the link - they cannot be read from the link itself
            row = ac_link.parent.parent
            try:
                acnum = row.find('span', attrs={'class': 'AccountNumber'}).text
                acnum = acnum.replace(' ', '')

                sortc = row.find('span', attrs={'class': 'SortCode'}).text
                sortc = sortc.replace(' ', '').replace('-', '')
            except Exception as e:
                logging.exception('RBS form error - ' + str(e))
                return 'bank error'

            # combine the two - to be our matching number
            num = sortc + "-" + acnum

            # long numbers are treated as credit cards
            actype = 'Cheque'
            if len(acnum) > 14:
                actype = 'Credit'

            # the balance is read from the fourth cell of the same row, so
            # the row needs at least four cells (the old "> 2" guard let a
            # three-cell row raise IndexError)
            balance = 0
            baltds = row.findAll('td')
            if len(baltds) > 3:
                baltext = self.tidy_text(baltds[3].text)
                balance = self.normalise_ammount(baltext)

            # and add it to our account list
            acpair = {'name': ac_link.text, 'num': num, 'type': actype, 'bal': balance}

            self.myAccounts.append(acpair)

Example #25

0

Show file

File: rbs_scraper.py Project: JamieMcNaught/Bank-Scraper

    def doStep3(self, allofit, page):
        """Inspect the RBS security page returned after the customer number step.

        Saves ``page`` via ``output_page`` as "RBS-security.html", then:

        * returns 'credentials incorrect' when the customer-number input
          (``ctl00$mainContent$LI5TABA$DBID_edit``) is still on the page;
        * returns 'bank error' when there is no ``aspnetForm`` form;
        * otherwise reads the form values with ``parseForm`` and probes for
          the first challenge label, setting ``useNewTab`` when it is missing.

        ``allofit`` is not used in the visible body.

        NOTE(review): the visible body ends right after the label probe and
        never uses the placeholder variables it defines - this looks like a
        truncated excerpt; confirm against the full source.
        """
        
        scrape_result = "good"
        
        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        # write out the start page
        self.output_page("RBS-security.html", page)
        
        scrape_result = 'good'
        logging.info("RBS security page2")

        # check if we got returned
        # check for the password input ctl00$mainContent$LI5TABA$DBID_edit then we didnt move on
        errorDiv=soup.findAll('input', attrs={'name' : 'ctl00$mainContent$LI5TABA$DBID_edit'})

        if len(errorDiv) != 0:
            logging.info("RBS security page1 still - customer number bad")
            return  'credentials incorrect'   # if we get here then the form was found hence creds must be wrong
        
        
        # find our form
        loginform=soup.find('form', attrs={'name': 'aspnetForm'})

        if loginform == None:
            logging.debug('RBS no security form')
            return 'bank error'

        values = self.parseForm(loginform)

        # define some variables that would only otherwise exist in a try catch block scope
        # the label text split on spaces
        which1arr = ""
        which2arr = ""
        which3arr = ""

        # the challenges
        firstDigit  = ""
        secondDigit = ""
        thirdDigit  = ""

        #>>>>>>> The first set of Pin fields
        #-------------------- get the questions --------------#

        #<label for="ctl00_mainContent_Tab1_LI6PPEA_edit" id="ctl00_mainContent_Tab1_LI6DDALALabel" class="wizCrl wizardLabelRememberMeWide">Enter the 2nd number</label>
        # Probe for the label under its id without the "Tab1" part; any
        # failure (e.g. find() returning None) flips useNewTab.
        useNewTab = False
        try:
            which1=soup.find('label', attrs={'for' : 'ctl00_mainContent_LI6PPEA_edit'}).text
        except Exception, e:
            useNewTab = True

Example #26

0

Show file

File: ugacads.py Project: saketkc/ug_acads

 def get(self):
     """Search ISBNdb for the ``search`` request parameter and list the hits.

     Sends a combined "books" query through ``isbndbpy``, parses the reply
     and writes title, ISBN-13, author and publisher of every ``bookdata``
     element to the response as a small HTML page.
     """
     bookname = self.request.get('search')
     req = isbndbpy.Request('books', 'combined', str(bookname))
     resp = req.send().read()
     soup = BS(str(resp))
     books = soup.findAll('bookdata')
     write = self.response.out.write
     # NOTE(review): values from the remote reply are written without HTML
     # escaping - confirm whether they should be escaped.
     write('<html><body>')
     for bookdata in books:
         write('<br/>Title: '+ str(bookdata.find('title').string ))
         write('<br/>ISBN: ' +str(bookdata.get('isbn13')))
         write('<br/>AUTHOR :'+str( bookdata.find('authorstext').string))
         write('<br/>PUBLISHER: '+str(bookdata.find('publishertext').string))
         write('<br/> "***********')
     # close the document opened above
     write('</body></html>')

Example #27

0

Show file

File: natwest_scraper.py Project: JamieMcNaught/Bank-Scraper

    def _parseNatWestLinks(self, raw):
        """Collect the account links and account summaries from a NatWest page.

        Every ``<a class="accountNameExpand">`` is appended to
        ``self.accountLinks``; for each one a dict with ``name``, ``num``
        (sort code + "-" + account number), ``type`` and ``bal`` is appended
        to ``self.myAccounts``.

        Returns "account problem" when the page has no account links and
        "bank error" when a row lacks the account-number or sort-code span.
        Falls off the end (returns None) when every account was parsed.
        """
        soup = BeautifulSoup(raw)
        accountBLock = soup.findAll("a", attrs={"class": "accountNameExpand"})

        # no account links at all means the continue form did not work
        if len(accountBLock) == 0:
            logging.warning("NatWest no accounts after continue form")
            return "account problem"

        for ac_link in accountBLock:
            ac_link.string = ac_link.text
            self.accountLinks.append(ac_link)

            # the account number and sort code live in the row that holds
            # the link - they cannot be read from the link itself
            row = ac_link.parent.parent
            try:
                acnum = row.find("span", attrs={"class": "AccountNumber"}).text
                acnum = acnum.replace(" ", "")

                sortc = row.find("span", attrs={"class": "SortCode"}).text
                sortc = sortc.replace(" ", "").replace("-", "")
            except Exception as e:
                logging.exception("NatWest form error - " + str(e))
                return "bank error"

            # combine the two - to be our matching number
            num = sortc + "-" + acnum

            # long numbers are treated as credit cards
            actype = "Cheque"
            if len(acnum) > 14:
                actype = "Credit"

            # the balance is read from the fourth cell of the same row, so
            # the row needs at least four cells (the old "> 2" guard let a
            # three-cell row raise IndexError)
            balance = 0
            baltds = row.findAll("td")
            if len(baltds) > 3:
                baltext = self.tidy_text(baltds[3].text)
                balance = self.normalise_ammount(baltext)

            # and add it to our account list
            acpair = {"name": ac_link.text, "num": num, "type": actype, "bal": balance}

            self.myAccounts.append(acpair)

Example #28

0

Show file

File: seasonList.py Project: andy13th/xbmc-catchuptv-au

 def __init__(self):
     params = self._parse_argv()
     source = self._fetch_url(self.BASE_URL +
                              unquote_plus(params['showUrl']))
     print 'Loading ' + self.BASE_URL + unquote_plus(params['showUrl'])
     seasonIndex = BeautifulSoup(source)
     tvshowcontainer = seasonIndex.find(
         'div',
         id=re.compile(
             'scet_header|scet_top|show-header|show-header-scet|^header$'))
     if tvshowcontainer == None:
         tvshowcontainer = seasonIndex.find(
             'div', {
                 'class':
                 re.compile(
                     'scet_header|scet_top|show-header|show-header-scet')
             })
     if tvshowcontainer != None:
         tvshowtitle = tvshowcontainer.find('h1').string
     else:
         tvshowtitle = re.search('var siteName = "(.+?)";', source).group(1)
     print 'Parsing seasons for "%s"' % tvshowtitle
     showsListing = seasonIndex.find('div', {
         "class": re.compile('scet-gallery-nav')
     }).find(
         'h3',
         text='Full Episodes').parent.findNextSibling('ul').findAll('li')
     for show in showsListing:
         showLink = show.find('a')
         print 'Found ' + showLink.string
         listitem = xbmcgui.ListItem(decode_htmlentities(showLink.string))
         listitem.setInfo('video', {'tvshowtitle': tvshowtitle})
         #listitem.setThumbnailImage(showLink.find('img')['src'])
         if showLink['href'][0] == '/':
             showUrl = showLink['href'][1:]
         else:
             showUrl = showLink['href'].replace(self.BASE_URL, '')
         xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]),
                                     listitem=listitem,
                                     url="%s?seasonUrl=%s" % (
                                         sys.argv[0],
                                         quote_plus(showUrl),
                                     ),
                                     totalItems=len(showsListing),
                                     isFolder=True)
     xbmcplugin.setContent(handle=int(sys.argv[1]), content='seasons')
     xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)

Example #29

0

Show file

    def DoStep2(self, allofit):
        """Build the password / memorable-answer POST from the challenge page.

        Decodes ``allofit['body']``, saves it as "fd-username.html", reads
        the three requested positions from the ``<strong>`` tags after the
        second ``<p>`` following the form, and stores the follow-up request
        (hex-encoded url and data, method POST, step 3) in ``self.response``.

        Returns 'good' on success, 'credentials incorrect' when the three
        lookup digits cannot be produced, and 'bank error' when the page has
        no form or fewer than two paragraphs after it.
        """
        page = self.HexToByte(allofit['body'])

        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        self.output_page("fd-username.html", page)

        loginform = soup.find('form')

        # same guard the sibling scrapers use: no form means the bank sent
        # something unexpected
        if loginform is None:
            logging.debug('no login form')
            return 'bank error'

        action = loginform['action']

        values = {}

        self.response = {}

        # build the post values up - there arent any others afaik

        ps = loginform.findAllNext('p')
        if len(ps) < 2:
            logging.debug('no challenge paragraph')
            return 'bank error'

        numbers = ps[1].findAllNext('strong')

        #not enough lookup digits
        try:
            password = self.lookupdigit(numbers[0].text) + self.lookupdigit(
                numbers[1].text) + self.lookupdigit(numbers[2].text)
        except Exception:
            logging.debug("credentials incorrect")
            return 'credentials incorrect'

        answer = self.filledCreds['06']

        values['password'] = password
        values['memorableAnswer'] = answer

        # build the body content
        data = urllib.urlencode(values)

        self.response['url'] = self.ByteToHex(action)
        self.response['data'] = self.ByteToHex(data)
        self.response['method'] = 'POST'
        self.response['step'] = 3

        return 'good'

Example #30

0

Show file

    def __init__(self, prontuario_html):
        """Parse *prontuario_html* and run every ``_parsear_*`` step in order.

        The markup is decoded as ISO-8859-1 with HTML entities converted,
        and the result dictionary starts out empty.
        """
        self.__soup = BeautifulSoup(
            prontuario_html,
            fromEncoding='iso-8859-1',
            convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.__prontuario = {}

        # Each step is looked up and called in turn, in this fixed order.
        for step_name in ('_parsear_dados_veiculo',
                          '_parsear_debitos',
                          '_parsear_infracoes_em_autuacao',
                          '_parsear_listagem_multas',
                          '_parsear_historico_multas',
                          '_parsear_ultimo_processo',
                          '_parsear_recurso_infracao'):
            getattr(self, step_name)()

Example #31

0

Show file

File: first_direct_scraper.py Project: JamieMcNaught/Bank-Scraper

 def DoStep2(self, allofit):
     """Build the password / memorable-answer POST from the challenge page.

     Decodes ``allofit['body']``, saves it as "fd-username.html", reads
     the three requested positions from the ``<strong>`` tags after the
     second ``<p>`` following the form, and stores the follow-up request
     (hex-encoded url and data, method POST, step 3) in ``self.response``.

     Returns 'good' on success, 'credentials incorrect' when the three
     lookup digits cannot be produced, and 'bank error' when the page has
     no form or fewer than two paragraphs after it.
     """
     page = self.HexToByte( allofit['body'])

     #-------------------------------- Grab the form values -----------------------------------------------
     soup = BeautifulSoup(page)

     self.output_page("fd-username.html", page)

     loginform=soup.find('form')

     # same guard the sibling scrapers use: no form means the bank sent
     # something unexpected
     if loginform is None:
         logging.debug('no login form')
         return 'bank error'

     action = loginform['action']

     values = {}

     self.response = {}

     # build the post values up - there arent any others afaik

     ps = loginform.findAllNext('p')
     if len(ps) < 2:
         logging.debug('no challenge paragraph')
         return 'bank error'

     numbers = ps[1].findAllNext('strong')

     #not enough lookup digits
     try:
         password =  self.lookupdigit(numbers[0].text) + self.lookupdigit(numbers[1].text) + self.lookupdigit(numbers[2].text)
     except Exception:
         logging.debug("credentials incorrect")
         return 'credentials incorrect'

     answer = self.filledCreds['06']

     values['password'] = password
     values['memorableAnswer'] = answer

     # build the body content
     data = urllib.urlencode(values)

     self.response['url'] = self.ByteToHex(action)
     self.response['data'] = self.ByteToHex(data)
     self.response['method'] = 'POST'
     self.response['step'] = 3

     return 'good'

Example #32

0

Show file

 def _get_soup(self, html):
     """Parse *html* with BeautifulSoup; return None if parsing raises.

     A warning is logged on failure.  Only ``Exception`` subclasses are
     caught, so KeyboardInterrupt and SystemExit still propagate.
     """
     try:
         return BeautifulSoup(html)
     except Exception:
         logging.warning(
             "BeautifulSoup exception cleaning up html, can't cache images offline"
         )
         return None

Example #33

0

Show file

    def _parseComment(self, communityId, liveInfoFilePath, commentFilePath):
        chatList = []
        if not (os.path.exists(liveInfoFilePath) and os.path.exists(commentFilePath)):
            return chatList

        infoParser = BeautifulSoup(open(liveInfoFilePath, u'r'))
        if not infoParser.find(u'communityid').renderContents() == communityId:
            return chatList

        commentParser = BeautifulSoup(open(commentFilePath, u'r'))
        chatTagList = commentParser.findAll(u'chat', attrs={u'msgkind': u'message_msg'})
        for chatTag in chatTagList:
            communityId = communityId.decode(u'utf-8')
            liveId = infoParser.find(u'liveid').renderContents().decode()
            userId = chatTag.get(u'user').decode(u'utf-8')
            name = chatTag.get(u'nickname').decode(u'utf-8')
            message = chatTag.renderContents().decode(u'utf-8')
            option = chatTag.get(u'mail').decode(u'utf-8') if chatTag.get(u'mail') != '' else None
            date = re.sub(
                ur'(\d{4})/(\d{1,2})/(\d{1,2})\s(\d{1,2}):(\d{1,2}):(\d{1,2})',
                lambda match: u'{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}'.format(int(match.group(1)), int(match.group(2)), int(match.group(3)), int(match.group(4)), int(match.group(5)), int(match.group(6))),
                chatTag.get(u'date')
            ).decode(u'utf-8')
            chatList.append((communityId, liveId, userId, name, message, option, date))

        return chatList

Example #34

0

Show file

 def __init__(self):
     """List every show on the full-episodes index as a plugin folder.

     Downloads ``BASE_FOD_URL``, walks each ``<li>`` inside the
     ``episodes-listing`` div and registers its first link with XBMC.  The
     folder URL carries the link target, with the leading slash or the
     ``BASE_URL`` prefix removed, in the ``showUrl`` query parameter.
     """
     index_page = BeautifulSoup(self._fetch_url(self.BASE_FOD_URL))
     listing = index_page.find('div', id='episodes-listing')
     entries = listing.findAll('li')
     entry_count = len(entries)
     for entry in entries:
         anchor = entry.find('a')
         target = anchor['href']
         # Strip either the leading slash or the site prefix.
         if target[0] == '/':
             relative = target[1:]
         else:
             relative = target.replace(self.BASE_URL, '')
         item_url = "%s?showUrl=%s" % (sys.argv[0], quote_plus(relative))
         xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]),
                                     listitem=xbmcgui.ListItem(anchor.string),
                                     url=item_url,
                                     totalItems=entry_count,
                                     isFolder=True)
     xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)

Example #35

0

Show file

File: rbs_scraper.py Project: JamieMcNaught/Bank-Scraper

    def doStep12(self, page):
        """Find the credit-card statement-detail link on an RBS page.

        Saves ``page`` as "RBS-xactlist-cc-poss.html" and scans the
        ``link-button-right`` anchors; the last one whose href matches the
        statement-detail pattern wins.  Returns 'bank error' when there are
        no such buttons or none of them matches.

        NOTE(review): the visible body ends after working out ``action`` and
        does not use it - this looks like a truncated excerpt; confirm.
        """
        soup = BeautifulSoup(page)
        self.output_page("RBS-xactlist-cc-poss.html", page)

        rightButtons = soup.findAll('a', attrs={'class': 'link-button-right'})
        if not rightButtons:
            logging.error('RBS no cc accountbuttons')
            return 'bank error'

        # RBS is not dynamic, so scanning this static list is fine.  Keep the
        # statement buttons only; a later match replaces an earlier one.
        acLink = None
        for button in rightButtons:
            href = button['href']
            if re.search(".ard.tatement.etail", href):
                acLink = href[:]

        if acLink is None:
            logging.debug('RBS no cc detail link')
            return 'bank error'

        # Use the link as-is when it parses; otherwise prefix the base URL.
        action = acLink
        try:
            logging.debug("checking link - " + acLink)
            urls = urlparse(acLink)
        except Exception as e:
            logging.error('RBS cc link error - ' + str(e))
            action = self.urlBase + '/' + acLink

Example #36

0

Show file

File: episodeList.py Project: Armedite/xbmc-catchuptv-au

 def __init__( self ):
     """List every episode of one season as a plugin directory entry.

     Fetches ``BASE_URL`` plus the ``seasonUrl`` plugin argument, follows
     each ``nbcu_pager_page`` link when a pager is present, and adds one
     directory item per ``<li>`` of the ``scet_th_full`` list.  Season and
     episode numbers are parsed out of titles shaped like ``Ep. 102: Name``
     (season 1, episode 2).

     Titles that do not match that shape are listed under their full title;
     previously they raised AttributeError / KeyError.
     """
     params = self._parse_argv()
     source = self._fetch_url(self.BASE_URL + unquote_plus(params['seasonUrl']))
     showIndex = BeautifulSoup(source)

     # The header container is matched by id first, then by class.
     tvshowcontainer = showIndex.find('div',id=re.compile('scet_header|scet_top|show-header|show-header-scet|^header$'))
     if tvshowcontainer is None:
         tvshowcontainer = showIndex.find('div',{'class':re.compile('scet_header|scet_top|show-header|show-header-scet')})
     if tvshowcontainer is not None:
         tvshowtitle = tvshowcontainer.find('h1').string
     else:
         tvshowtitle = re.search('var siteName = "(.+?)";',source).group(1)

     pages = 1
     pager = showIndex.find('div',{'class':re.compile('nbcu_pager')})
     if pager is not None:
         pageLinks = pager.findAll('a',{'class':re.compile('nbcu_pager_page')})
         pages = len(pageLinks)

     # Patterns used for every episode, compiled once.
     list_class = re.compile('scet_th_full')
     title_class = re.compile('list_full_det_title')
     plot_class = re.compile('list_full_des')
     episode_re = re.compile(r'^Ep(?:\.\s*)?([0-9]{1,2})([0-9][0-9])(?:\s*:\s*)?(.+)$')

     for i in range(0,pages):
         # the first page is already loaded; later pages are fetched
         if i>0:
             source = self._fetch_url(self.BASE_URL + pageLinks[i]['href'])
             showIndex = BeautifulSoup(source)
         episodesListing = showIndex.find('ul',{'class':list_class}).findAll('li')
         for episode in episodesListing:
             title = decode_htmlentities(episode.find('p',{'class':title_class}).find('a').string)
             # the full title is the fallback when it carries no number
             vidInfo = {'tvshowtitle': tvshowtitle, 'studio': 'NBC', 'title': title}
             listitem = xbmcgui.ListItem(title)
             listitem.setThumbnailImage(episode.find('img')['src'])
             episodeLink = episode.find('a')
             if episodeLink['href'][0] == '/':
                 episodeUrl = episodeLink['href'][1:]
             else:
                 episodeUrl = episodeLink['href'].replace(self.BASE_URL,'')
             plotTag = episode.find('p',{'class':plot_class})
             if plotTag is not None:
                 vidInfo['plot'] = decode_htmlentities(plotTag.find('em').string)
             epNum = episode_re.search(title)
             if epNum is not None:
                 vidInfo['season'] = int(epNum.group(1))
                 vidInfo['episode'] = int(epNum.group(2))
                 vidInfo['title'] = epNum.group(3)
             listitem.setInfo("video",vidInfo)
             # NOTE(review): 0 is sent for season/episode when the title has
             # no number - confirm the handler of this URL accepts that.
             xbmcplugin.addDirectoryItem(handle=int( sys.argv[ 1 ] ),listitem=listitem,url="%s?episodeUrl=%s&episode=%s&season=%s" % ( sys.argv[ 0 ], quote_plus(episodeUrl),vidInfo.get('episode', 0),vidInfo.get('season', 0)))
     xbmcplugin.addSortMethod( handle=int( sys.argv[ 1 ] ), sortMethod=xbmcplugin.SORT_METHOD_EPISODE )
     xbmcplugin.addSortMethod( handle=int( sys.argv[ 1 ] ), sortMethod=xbmcplugin.SORT_METHOD_DATE )
     xbmcplugin.addSortMethod( handle=int( sys.argv[ 1 ] ), sortMethod=xbmcplugin.SORT_METHOD_LABEL )
     xbmcplugin.addSortMethod( handle=int( sys.argv[ 1 ] ), sortMethod=xbmcplugin.SORT_METHOD_DURATION )
     xbmcplugin.setContent( handle=int(sys.argv[1]), content='episodes')
     xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ), succeeded=1 )

Example #37

0

Show file

File: rbs_scraper.py Project: JamieMcNaught/Bank-Scraper

    def doStep1(self, allofit, page):
        """Locate the security frame on the first RBS page.

        Saves ``page`` as "1_first_page.html".  When the frame with id
        ``ctl00_secframe`` is present, ``self.response`` is set to a GET of
        ``urlBase + '/' + frame src`` (hex-encoded) for step 2 and 'good' is
        returned; otherwise 'bank error' is returned.  ``allofit`` is not
        used here.
        """
        logging.info("RBS page1")

        # write out the start page
        self.output_page("1_first_page.html", page)

        frame = BeautifulSoup(page).find('frame', id='ctl00_secframe')
        if frame is None:
            logging.debug('RBS frame link error - ')
            return 'bank error'

        # e.g. <frame id="ctl00_secframe" src="login.aspx?refererident=...&amp;cookieid=..."></frame>
        action = self.urlBase + '/' + frame['src']

        response = self.response = {}
        response['url'] = self.ByteToHex(action)
        response['data'] = ""
        response['method'] = 'GET'
        response['step'] = 2
        return 'good'

Example #38

0

Show file

    def _loadComment(self, communityId, userSettingFilePath, commentLogFolder):
        nameDict = self._loadUserSetting(communityId, userSettingFilePath)
        commentLogFileList = filter(lambda file: re.match(ur'ncvLog_lv\d+-{0}\.xml$'.format(communityId), file) , os.listdir(commentLogFolder))

        chatList = []
        for commentFile in commentLogFileList:
            parser = BeautifulSoup(open(os.path.join(commentLogFolder, commentFile), u'r'))
            liveId = u'lv' + parser.find(u'livenum').renderContents().decode(u'utf-8')
            chatTagList = parser.find(u'livecommentdataarray').findAll(u'chat', recursive=False)
            for chatTag in chatTagList:
                userId = chatTag.get(u'user_id')
                if chatTag.get(u'user_id') == u'':
                    continue

                name = nameDict.get(userId)
                message = chatTag.renderContents().decode(u'utf-8')
                option = chatTag.get(u'mail')
                unixtime = time.localtime(int(chatTag.get(u'date')))
                date = (datetime.datetime(*unixtime[:-3]).strftime(u'%Y-%m-%d %H:%M:%S') if unixtime else None).decode(u'utf-8')
                chatList.append((communityId, liveId, userId, name, message, option, date))

        return chatList

Example #39

0

Show file

    def doStep12(self, page):
        """Find the credit-card statement-detail link on a NatWest page.

        Saves ``page`` as "natwest-xactlist-cc-poss.html" and scans the
        ``link-button-right`` anchors; the last one whose href matches the
        statement-detail pattern wins.  Returns 'bank error' when there are
        no such buttons or none of them matches.

        NOTE(review): the visible body ends after working out ``action`` and
        does not use it - this looks like a truncated excerpt; confirm.
        """
        soup = BeautifulSoup(page)
        self.output_page("natwest-xactlist-cc-poss.html", page)

        rightButtons = soup.findAll('a', attrs={'class': 'link-button-right'})
        if not rightButtons:
            logging.error('NatWest no cc accountbuttons')
            return 'bank error'

        # NatWest is not dynamic, so scanning this static list is fine.  Keep
        # the statement buttons only; a later match replaces an earlier one.
        acLink = None
        for button in rightButtons:
            href = button['href']
            if re.search(".ard.tatement.etail", href):
                acLink = href[:]

        if acLink is None:
            logging.debug('NatWest no cc detail link')
            return 'bank error'

        # Use the link as-is when it parses; otherwise prefix the base URL.
        action = acLink
        try:
            logging.debug("checking link - " + acLink)
            urls = urlparse(acLink)
        except Exception as e:
            logging.error('NatWest cc link error - ' + str(e))
            action = self.urlBase + '/' + acLink

Example #40

0

Show file

    def __makerequest(self, cmd, **kwargs):
        """Send *cmd* with its keyword arguments to the API and parse the reply.

        ``cmd`` and, when set, the stored token are added to the arguments;
        string values are UTF-8 encoded before URL-encoding.  A
        ``urllib2.URLError`` is re-raised as ``FogBugzConnectionError``.

        NOTE(review): the visible body stores the parsed ``response`` element
        but does not return it - this looks like a truncated excerpt; confirm.
        """
        kwargs["cmd"] = cmd
        if self._token:
            kwargs["token"] = self._token

        try:
            encoded = {}
            for key, value in kwargs.items():
                if isinstance(value, basestring):
                    value = value.encode('utf-8')
                encoded[key] = value
            query = urllib.urlencode(encoded)
            reply = self._opener.open(self._url + query)
            response = BeautifulSoup(reply).response
        except urllib2.URLError as e:
            raise FogBugzConnectionError(e)

Example #41

0

Show file

File: route_handling.py Project: pursh2002/Web-Scraping

    def read_timetable_file(self, route_filename, weekday, direction):
        # Example of how the timetables can be read and returned as a Map

        from BeautifulSoup.BeautifulSoup import BeautifulSoup
        import urllib2, re, time

        filestr = ('data/timetables/%s_%s_%s.html' %
                   (route_filename, weekday, direction))
        fil = open(filestr, "r")
        soup = BeautifulSoup(fil.read(), fromEncoding='utf8')
        fil.close()

        divs = soup.html.body.findAll('div')
        children = divs[0].contents

        #timetable
        tt = children[1].contents[3].contents[3].contents[3].contents[
            1].contents[2]

        route_list = []
        route_times_list = []
        # stop names values
        for (j, name) in enumerate(tt.contents[0].contents[4].contents):

            route_times_list = []
            route_name = name.contents[1].find('a').contents[0]
            print route_name
            #am / pm values
            for (i, name) in enumerate(
                    tt.contents[0].contents[3].contents[2].contents):
                time_value = tt.contents[0].contents[5].contents[
                    j + 1].contents[i].contents[0].text
                if time_value == '-':
                    print time_value
                    continue
                time_prefix = name.text
                #values minus the first
                time_str = '' + time_value + ' ' + time_prefix
                try:
                    time_result = time.strftime(
                        '%H:%M:%S', time.strptime(time_str, '%I:%M %p'))
                    route_times_list.append(time_result)
                    print time_result
                except:
                    print "ERR", time_str

            route_list.append((route_name, route_times_list))

        return route_list

Example #42

0

Show file

File: showList.py Project: Armedite/xbmc-catchuptv-au

 def __init__( self ):
     print 'Fetching %s' % self.INDEX_URL
     source = self._fetch_url(self.INDEX_URL)
     fodIndex = BeautifulSoup(source)
     showsListing = fodIndex.find('div',{"class":re.compile('group-full-eps')}).findAll('li')
     print 'Parsed listing and found %d shows' % len(showsListing)
     for show in showsListing:
         showLink = show.find('a')
         listitem=xbmcgui.ListItem(decode_htmlentities(showLink['title']))
         episodeCount = show.find('div',text=re.compile('^[0-9]+ Videos?$'))
         if episodeCount:
             episodeCount = int(re.search('^([0-9]+)\s*Videos?$',episodeCount.string).group(1))
             print 'Found "%s" with %d episodes' % (decode_htmlentities(showLink['title']),episodeCount)
             listitem.setInfo('video',{'episode':episodeCount})
         else: 
             print 'Found "%s" but did not find how many episodes' % decode_htmlentities(showLink['title'])
         listitem.setThumbnailImage(showLink.find('img')['src'])
         if showLink['href'][0] == '/':
             showUrl = showLink['href'][1:]
         else:
             showUrl = showLink['href'].replace(self.BASE_URL,'')
         xbmcplugin.addDirectoryItem(handle=int( sys.argv[ 1 ] ),listitem=listitem,url="%s?showUrl=%s" % ( sys.argv[ 0 ], quote_plus(showUrl)),totalItems=len(showsListing),isFolder=True)
     xbmcplugin.setContent(handle=int(sys.argv[1]), content='tvshows')
     xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ), succeeded=1 )

Example #43

0

Show file

 def __init__(self):
     """Fill the XBMC directory with one folder entry per show.

     Downloads ``self.INDEX_URL``, takes every ``<li>`` found inside the
     first ``<div>`` whose class matches ``group-full-eps`` and adds a
     list item for it (title, thumbnail and, when the page shows one,
     the video count). Each item links back to this plugin through a
     ``showUrl`` query parameter.
     """
     print('Fetching %s' % self.INDEX_URL)
     index_page = BeautifulSoup(self._fetch_url(self.INDEX_URL))
     full_eps_group = index_page.find(
         'div', {"class": re.compile('group-full-eps')})
     entries = full_eps_group.findAll('li')
     print('Parsed listing and found %d shows' % len(entries))

     for entry in entries:
         anchor = entry.find('a')
         item = xbmcgui.ListItem(decode_htmlentities(anchor['title']))

         # The page shows the count as text such as "12 Videos" / "1 Video".
         count_text = entry.find('div',
                                 text=re.compile('^[0-9]+ Videos?$'))
         if not count_text:
             print('Found "%s" but did not find how many episodes' %
                   decode_htmlentities(anchor['title']))
         else:
             total = int(
                 re.search('^([0-9]+)\s*Videos?$',
                           count_text.string).group(1))
             print('Found "%s" with %d episodes' %
                   (decode_htmlentities(anchor['title']), total))
             item.setInfo('video', {'episode': total})

         item.setThumbnailImage(anchor.find('img')['src'])

         # Store the link relative to BASE_URL: drop a leading slash,
         # otherwise strip the BASE_URL prefix from an absolute link.
         href = anchor['href']
         show_url = href[1:] if href[0] == '/' else href.replace(
             self.BASE_URL, '')

         plugin_url = "%s?showUrl=%s" % (sys.argv[0], quote_plus(show_url))
         xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]),
                                     listitem=item,
                                     url=plugin_url,
                                     totalItems=len(entries),
                                     isFolder=True)

     xbmcplugin.setContent(handle=int(sys.argv[1]), content='tvshows')
     xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)

Example #44

0

Show file

File: prontuario_veiculo.py Project: pedropaulovc/APIDetranSC

 def __init__(self, prontuario_html):
     '''
     Parse the given HTML and run every section parser.

     prontuario_html is handed to BeautifulSoup, decoded as iso-8859-1
     with HTML entities converted. The section parsers are then called
     one after another on a fresh, empty result dictionary.
     '''
     soup = BeautifulSoup(prontuario_html,
                          fromEncoding='iso-8859-1',
                          convertEntities=BeautifulSoup.HTML_ENTITIES)
     self.__soup = soup
     self.__prontuario = {}

     # One call per section of the page, in the original order.
     self._parsear_dados_veiculo()
     self._parsear_debitos()
     self._parsear_infracoes_em_autuacao()
     self._parsear_listagem_multas()
     self._parsear_historico_multas()
     self._parsear_ultimo_processo()
     self._parsear_recurso_infracao()

Example #45

0

Show file

    def __init__(self, url, token=None):
        """Connect to a FogBugz site and discover its API endpoint.

        url   -- base URL of the FogBugz site; a trailing '/' is appended
                 when missing.
        token -- optional API token. Stored UTF-8 encoded in
                 ``self._token``; ``self._token`` is None when no token
                 is given.

        Raises FogBugzConnectionError when ``<url>api.xml`` cannot be
        opened (URLError from the opener).
        """
        self.__handlerCache = {}
        if not url.endswith('/'):
            url += '/'

        # The original wrote ``self_token = None`` (a throwaway local),
        # which left ``self._token`` undefined when no token was passed.
        if token:
            self._token = token.encode('utf-8')
        else:
            self._token = None

        self._opener = urllib2.build_opener()
        try:
            soup = BeautifulSoup(self._opener.open(url + 'api.xml'))
        except URLError:
            # self._url is only assigned below, so report the URL we were
            # given; reading self._url here raised AttributeError instead
            # of the intended connection error.
            raise FogBugzConnectionError(
                "Library could not connect to the FogBugz API.  Either this installation of FogBugz does not support the API, or the url, %s, is incorrect."
                % (url, ))
        # api.xml's <response><url> holds the API path relative to the site.
        self._url = url + soup.response.url.string
        self.currentFilter = None

Example #46

0

Show file

 def _loadUserSetting(self, communityId, userSettingFilePath):
     """Read the user settings file and return the entries of one community.

     communityId         -- value the ``community`` attribute must have.
     userSettingFilePath -- path of the settings file to parse.

     Returns a dict mapping the rendered contents of every ``<user>`` tag
     that has the given ``community`` and any ``name`` attribute to the
     value of that ``name`` attribute.
     """
     # BeautifulSoup reads the whole file while building the tree, so the
     # handle can be closed as soon as the constructor returns (the
     # original never closed it).
     with open(userSettingFilePath, u'r') as settingFile:
         parser = BeautifulSoup(settingFile)

     nameTagList = parser.findAll(
         u'user', attrs={u'community': communityId, u'name': True})
     return dict(
         (tag.renderContents(), tag.get(u'name')) for tag in nameTagList)

Example #47

0

Show file

File: rbs_scraper.py Project: JamieMcNaught/Bank-Scraper

    def doStep4(self, allofit, page):
        """Classify the page returned after login and prepare the next POST.

        allofit -- not used by this step.
        page    -- HTML of the page to inspect; also written out through
                   self.output_page().

        Returns:
          'credentials incorrect' -- the login input is still on the page,
                                     or neither the finish nor the next
                                     button can be found;
          'good'                  -- an account table is present;
          'bank error'            -- the 'aspnetForm' form is missing;
          'messages'              -- self.response has been filled with a
                                     POST of that form (step 4).
        """
        soup = BeautifulSoup(page)

        # keep a copy of the page we were handed
        self.output_page("RBS-pos-accounts.html", page)

        logging.info("RBS message or bad cred check ")

        # the credentials input is still there -> the login was rejected
        if soup.findAll('input', attrs={'name' : 'ctl00$mainContent$LI6PPEA_edit'}):
            logging.info("RBS defiantely bad credentials")
            return 'credentials incorrect'

        # an account table means we are already through
        if soup.findAll('table', attrs={'class' : 'AccountTable'}):
            logging.debug("RBS defiantely got some good accounts")
            return 'good'

        # otherwise expect an interstitial page with a finish or next button
        finish_button = soup.find('input', attrs={'id' : 'ctl00_mainContent_FinishButton_button'})
        if finish_button is None:
            logging.warning("RBS cant find finish button credentials incorrect")

            next_button = soup.find('input', attrs={'id' : 'ctl00_mainContent_NextButton_button'})
            if next_button is None:
                logging.warning("RBS cant find next button either")
                return 'credentials incorrect'

        # the form those buttons belong to
        form = soup.find('form', attrs={'name': 'aspnetForm'})
        if form is None:
            logging.debug('RBS no continue form')
            return 'bank error'

        action = self.urlBase + '/' + form['action']

        # hidden values etc. collected from the form, url-encoded as the body
        data = urllib.urlencode(self.parseForm(form))

        self.response = {
            'url': self.ByteToHex(action),
            'data': self.ByteToHex(data),
            'method': 'POST',
            'step': 4,
        }

        return 'messages'

Example #48

0

Show file

    def _processCCAccount(self, raw, account_path, balance):
        """Parse a credit-card transactions page into a new statement.

        raw          -- HTML of the transactions page.
        account_path -- passed to StatementBuilder; when it is the empty
                        string no new builder is created.
        balance      -- passed unchanged to set_current_balance().

        Rows of the 'ItemTable' table are read cell by cell:
        0 = date, 1 = text whose last 5 characters are dropped to form the
        extra display, 2 = transaction kind (the extra display is kept only
        for 'SALE'), 3 = description, 4 = credit amount, 5 = debit amount.
        A transaction is emitted when cell 5 is reached.

        Any exception raised while parsing is logged and swallowed.
        """
        soup = BeautifulSoup(raw)

        logging.debug('CC ac path - ' + str(account_path) + ' - end')

        try:
            if account_path != "":
                logging.debug('Processing :) ')

                builder = StatementBuilder(self.facade, account_path,
                                           self.token)
                self.statementlist.append(builder)
                self.statementbuilder = self.statementlist[
                    self.current_statement]

                # set up our statement
                self.statementbuilder.make_recent_dif_statement(
                    'NatWest-recent', 'Scraper', None)

                # the balance is passed in rather than scraped
                logging.debug("Balance - - - - - - - > " + str(balance))
                self.statementbuilder.set_current_balance(balance)

                # now find all the recent transactions
                x_table = soup.find('table', attrs={'class': 'ItemTable'})

                if x_table is not None:
                    x_body = x_table.find('tbody')

                    for row in x_body.findAll('tr'):
                        vals = row.findAll('td')
                        if not vals:
                            continue

                        atts = {}
                        datebit = ''
                        for i, val in enumerate(vals):
                            data = self.tidy_text(val.text)

                            if i == 0:
                                dp = DateParser()
                                try:
                                    atts['date'] = dp.ymd_from_date(
                                        dp.date_from_small(data))
                                except Exception:
                                    # Was ``atts['date'] == ''``: a
                                    # comparison that raised KeyError and
                                    # aborted the whole parse on the first
                                    # unparseable date.
                                    atts['date'] = ''

                            elif i == 1:
                                datebit = data[:-5]

                            elif i == 2:
                                # only keep extra xact date for Sales
                                if data != 'SALE':
                                    datebit = ''

                            elif i == 3:
                                if data != "":
                                    atts['display'] = " ".join(
                                        data.split()).encode('utf8')
                                    atts['extradisplay'] = datebit.encode(
                                        'utf8')

                            else:  # i > 3: the numbers
                                if data != "" and data != '-':
                                    amount = self.normalise_ammount(data)

                                    if i == 4:
                                        atts['amount'] = amount
                                        atts['type'] = 'Credit'

                                    if i == 5:
                                        atts['amount'] = amount
                                        atts['type'] = 'Debit'

                                # the row is complete once cell 5 was seen
                                if i == 5:
                                    self.statementbuilder.make_xact(atts)

            # NOTE(review): this also runs when account_path is empty, i.e.
            # on whatever builder self.statementbuilder already holds -
            # confirm that is intended.
            self.statementbuilder.put_statement()
            self.current_statement = self.current_statement + 1

        except Exception as e:
            logging.exception('NatWest parsing error - ' + str(e))

Example #49

0

Show file

    def doStep4(self, allofit, page):
        """Decide what the post-login page is and queue the follow-up POST.

        allofit -- not used by this step.
        page    -- HTML to inspect; a copy is written via self.output_page().

        Returns 'credentials incorrect' when the login input is still
        present or no finish/next button exists, 'good' when an account
        table is present, 'bank error' when the 'aspnetForm' form is
        missing, and otherwise 'messages' after storing a POST of that
        form in self.response (step 4).
        """
        soup = BeautifulSoup(page)

        # write out the page for later inspection
        self.output_page("natwest-pos-accounts.html", page)

        logging.info("NatWest message or bad cred check ")

        # login input still shown -> credentials were not accepted
        login_inputs = soup.findAll(
            'input', attrs={'name': 'ctl00$mainContent$LI6PPEA_edit'})
        if login_inputs:
            logging.info("NatWest defiantely bad credentials")
            return 'credentials incorrect'

        # account details present -> nothing more to click through
        account_tables = soup.findAll('table',
                                      attrs={'class': 'AccountTable'})
        if account_tables:
            logging.debug("NatWest defiantely got some good accounts")
            return 'good'

        # look for the usual continue button; fall back to the next button
        finish_id = 'ctl00_mainContent_FinishButton_button'
        if soup.find('input', attrs={'id': finish_id}) is None:
            logging.warning(
                "NatWest cant find finish button credentials incorrect")

            next_id = 'ctl00_mainContent_NextButton_button'
            if soup.find('input', attrs={'id': next_id}) is None:
                logging.warning("NatWest cant find next button either")
                return 'credentials incorrect'

        # the form the buttons live in
        form = soup.find('form', attrs={'name': 'aspnetForm'})
        if form is None:
            logging.debug('NatWest no continue form')
            return 'bank error'
        logging.debug('found a continue form - so clicking it')

        action = self.urlBase + '/' + form['action']

        # hidden fields and friends, url-encoded as the request body
        body = urllib.urlencode(self.parseForm(form))

        response = {}
        response['url'] = self.ByteToHex(action)
        response['data'] = self.ByteToHex(body)
        response['method'] = 'POST'
        response['step'] = 4
        self.response = response

        return 'messages'

Example #50

0

Show file

 def __init__(self):
     """List the episodes of one show as XBMC directory items.

     Reads the ``seasonUrl`` plugin parameter, fetches that page below
     ``self.BASE_URL`` and, when the page has an ``nbcu_pager`` block,
     every further page linked from it. Each ``<li>`` of the
     ``scet_th_full`` list becomes a list item with thumbnail, title,
     optional plot and, when the title looks like ``Ep. 312: Name``,
     season and episode numbers.
     """
     params = self._parse_argv()
     source = self._fetch_url(self.BASE_URL +
                              unquote_plus(params['seasonUrl']))
     showIndex = BeautifulSoup(source)

     # The show title sits in a header div that is identified either by
     # id or by class; fall back to the siteName JavaScript variable.
     tvshowcontainer = showIndex.find(
         'div',
         id=re.compile(
             'scet_header|scet_top|show-header|show-header-scet|^header$'))
     if tvshowcontainer is None:
         tvshowcontainer = showIndex.find(
             'div', {
                 'class':
                 re.compile(
                     'scet_header|scet_top|show-header|show-header-scet')
             })
     if tvshowcontainer is not None:
         tvshowtitle = tvshowcontainer.find('h1').string
     else:
         tvshowtitle = re.search('var siteName = "(.+?)";', source).group(1)

     # Look the pager up once instead of twice.
     pages = 1
     pageLinks = []
     pager = showIndex.find('div', {'class': re.compile('nbcu_pager')})
     if pager is not None:
         pageLinks = pager.findAll(
             'a', {'class': re.compile('nbcu_pager_page')})
         pages = len(pageLinks)

     for i in range(0, pages):
         if i > 0:
             # page 0 is the page already loaded above
             source = self._fetch_url(self.BASE_URL + pageLinks[i]['href'])
             showIndex = BeautifulSoup(source)
         episodesListing = showIndex.find(
             'ul', {
                 'class': re.compile('scet_th_full')
             }).findAll('li')
         for episode in episodesListing:
             vidInfo = {'tvshowtitle': tvshowtitle, 'studio': 'NBC'}
             title = decode_htmlentities(
                 episode.find('p', {
                     'class': re.compile('list_full_det_title')
                 }).find('a').string)
             listitem = xbmcgui.ListItem(title)
             listitem.setThumbnailImage(episode.find('img')['src'])

             episodeLink = episode.find('a')
             if episodeLink['href'][0] == '/':
                 episodeUrl = episodeLink['href'][1:]
             else:
                 episodeUrl = episodeLink['href'].replace(self.BASE_URL, '')

             description = episode.find(
                 'p', {'class': re.compile('list_full_des')})
             if description:
                 vidInfo['plot'] = decode_htmlentities(
                     description.find('em').string)

             # "Ep. 312: Name" -> season 3, episode 12, title "Name"
             epNum = re.search(
                 r'^Ep(?:\.\s*)?([0-9]{1,2})([0-9][0-9])(?:\s*:\s*)?(.+)$',
                 title)
             if epNum is not None:
                 vidInfo['season'] = int(epNum.group(1))
                 vidInfo['episode'] = int(epNum.group(2))
                 vidInfo['title'] = epNum.group(3)
             else:
                 # The original called epNum.group(3) unconditionally and
                 # crashed on titles without an "Ep" prefix; keep the
                 # full title instead.
                 vidInfo['title'] = title

             listitem.setInfo("video", vidInfo)
             # NOTE(review): 0 is sent for episode/season when the title
             # carries no number - confirm the receiving handler accepts it.
             xbmcplugin.addDirectoryItem(
                 handle=int(sys.argv[1]),
                 listitem=listitem,
                 url="%s?episodeUrl=%s&episode=%s&season=%s" %
                 (sys.argv[0], quote_plus(episodeUrl),
                  vidInfo.get('episode', 0), vidInfo.get('season', 0)))

     xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                              sortMethod=xbmcplugin.SORT_METHOD_EPISODE)
     xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                              sortMethod=xbmcplugin.SORT_METHOD_DATE)
     xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                              sortMethod=xbmcplugin.SORT_METHOD_LABEL)
     xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                              sortMethod=xbmcplugin.SORT_METHOD_DURATION)
     xbmcplugin.setContent(handle=int(sys.argv[1]), content='episodes')
     xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)

Example #51

0

Show file

class ProntuarioVeiculo(object):
    '''
    Vehicle record ("prontuario") parsed from an HTML page.

    The HTML is parsed once in the constructor. Each _parsear_* method
    reads one <div id="div_servicos_NN"> section and stores what it finds
    in a private dictionary; obter_prontuario() returns a copy of it.
    '''
    def __init__(self, prontuario_html):
        '''
        Parse prontuario_html and run every section parser.

        The markup is decoded as iso-8859-1 with HTML entities converted.
        '''

        self.__soup = \
            BeautifulSoup(prontuario_html, fromEncoding='iso-8859-1',
                          convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.__prontuario = {}

        self._parsear_dados_veiculo()
        self._parsear_debitos()
        self._parsear_infracoes_em_autuacao()
        self._parsear_listagem_multas()
        self._parsear_historico_multas()
        self._parsear_ultimo_processo()
        self._parsear_recurso_infracao()

    def obter_prontuario(self):
        '''Return a deep copy of the parsed record, so callers cannot
        modify the internal dictionary.'''
        return deepcopy(self.__prontuario)

    def _parsear_dados_veiculo(self):
        '''Store every two-text cell of section 02 as label -> value.'''
        tabela = self.__soup.find('div', id='div_servicos_02').table.tbody

        for celula in tabela.findAll('td'):
            dado = celula.findAll(text=True)
            # cells that do not hold exactly a label and a value are skipped
            if len(dado) == 2:
                self.__prontuario[dado[0].strip()] = dado[1].strip()

    def _parsear_debitos(self):
        '''Store the rows of section 03 as a list of dicts under u'Débitos'.'''
        tabela = self.__soup.find('div', id='div_servicos_03').table.tbody

        # keys for the cells after the first one, in column order
        celulas = [
            u'Número DetranNet', u'Vencimento', u'Valor Nominal(R$)',
            u'Multa(R$)', u'Juros(R$)', u'Valor Atual(R$)'
        ]

        debitos = []
        # the first and last rows are not data rows
        # (presumably header and footer - TODO confirm)
        for linha in tabela.findAll('tr')[1:-1]:
            debito = {}

            # findAll always returns a list, so no None check is needed
            debito[u'Classe'] = ''.join(linha.td.findAll(text=True)).strip()

            link = ''
            if linha.td.a is not None:
                link = linha.td.a['href'].strip()
            debito[u'Link'] = link

            for celula, valor in zip(celulas, linha.findAll('td')[1:]):
                debito[celula] = valor.string.strip()

            debitos.append(debito)

        self.__prontuario[u'Débitos'] = debitos

    def _parsear_tabela_de_sete_celulas(self, id_div, chave_1, chave_2):
        '''
        Parse a section whose leaf cells come in groups of seven.

        id_div is the id of the section's <div>; chave_1 and chave_2 are
        the dictionary keys used for the second and third cell of each
        group. Returns a list with one dict per complete group.
        '''
        tabela = self.__soup.find('div', id=id_div).table.tbody

        def celula_filha(tag):
            # a <td> that wraps a nested <table> is not a data cell
            return tag.name == 'td' and tag.table is None

        # the first three leaf cells are skipped
        # (presumably headings - TODO confirm)
        celulas = tabela.findAll(celula_filha)[3:]

        registros = []
        # floor division: a plain '/' yields a float under true division
        # and range() then raises TypeError
        for i in range(len(celulas) // 7):
            linha = 7 * i

            registro = {}
            registro[u'Número'] = celulas[linha].a.string
            registro[u'Link'] = celulas[linha].a['href'].strip()
            registro[chave_1] = celulas[linha + 1].string.strip()
            registro[chave_2] = celulas[linha + 2].string.strip()
            registro[u'Descrição 1'] = celulas[linha + 3].string.strip()
            registro[u'Descrição 2'] = celulas[linha + 4].string.strip()
            registro[u'Local/Complemento 1'] = celulas[linha +
                                                       5].string.strip()
            # the last cell may have no single string; treat it as empty
            complemento = celulas[linha + 6].string
            if complemento is None:
                complemento = u''
            registro[u'Local/Complemento 2'] = complemento.strip()

            registros.append(registro)

        return registros

    def _parsear_infracoes_em_autuacao(self):
        '''Store section 10 under u'Infrações em Autuação'.'''
        self.__prontuario[u'Infrações em Autuação'] = \
            self._parsear_tabela_de_sete_celulas(
                'div_servicos_10', u'Valor', u'Situação')

    # TODO: implement parsing of a non-empty listing
    def _parsear_listagem_multas(self):
        '''Store an empty list for section 04 when it reports no entries.'''
        tabela = self.__soup.find('div', id='div_servicos_04').table.tbody

        if tabela.tr.td.find(text=re.compile(u'Nenhuma?')):
            self.__prontuario[u'Listagem de Multas'] = []
            return

    def _parsear_historico_multas(self):
        '''Store section 07 under u'Histórico de Multas'.'''
        self.__prontuario[u'Histórico de Multas'] = \
            self._parsear_tabela_de_sete_celulas(
                'div_servicos_07', u'Lançamento', u'Pagamento')

    def _parsear_ultimo_processo(self):
        '''Store section 11 as a key -> value dict under u'Último Processo'.'''
        tabela = self.__soup.find('div', id='div_servicos_11').table.tbody

        ultimo_processo = {}
        celulas = tabela.findAll('td')
        # the first five cells each hold a label followed by its value
        for celula in celulas[:5]:
            dado = celula.findAll(text=True)
            ultimo_processo[dado[0]] = dado[1]
        # from cell 7 on, labels and values alternate in separate cells;
        # stop one short so a trailing unpaired cell cannot overrun
        for i in range(7, len(celulas) - 1, 2):
            chave = celulas[i].findAll(text=True)[0]
            valor = celulas[i + 1].findAll(text=True)[0]
            ultimo_processo[chave] = valor

        self.__prontuario[u'Último Processo'] = ultimo_processo

    # TODO: implement parsing of a non-empty listing
    def _parsear_recurso_infracao(self):
        '''Store an empty list for section 09 when it reports no entries.'''
        tabela = self.__soup.find('div', id='div_servicos_09').table.tbody

        if tabela.tr.td.find(text=re.compile(u'Nenhuma?')):
            self.__prontuario[u'Recurso de Infração'] = []
            return

Example #52

0

Show file

File: rbs_scraper.py Project: JamieMcNaught/Bank-Scraper

    def _processNormAccount(self, raw, account_path, balance):
        """Parse a current-account transactions page into a new statement.

        raw          -- HTML of the transactions page.
        account_path -- passed to StatementBuilder; when it is the empty
                        string no new builder is created.
        balance      -- passed unchanged to set_current_balance().

        Rows of the 'ItemTable' table are read cell by cell:
        0 = date, 1 = transaction kind ('ATM' prefixes the display with
        'CASH - '), 2 = comma separated description, 3 = credit amount,
        4 = debit amount. A transaction is emitted when cell 5 is reached.

        Any exception raised while parsing is logged and swallowed.
        """

        soup = BeautifulSoup(raw)

        logging.debug('Norm ac path - ' + str(account_path) + ' - end' )

        try:
            if account_path != "":
                logging.debug('Processing :) norm ' )

                builder = StatementBuilder(self.facade, account_path, self.token)
                self.statementlist.append(builder)
                self.statementbuilder = self.statementlist[self.current_statement]

                # set up our statement
                self.statementbuilder.make_recent_dif_statement('RBS-recent', 'Scraper', None)

                # the balance is passed in rather than scraped
                logging.debug("Balance - - - - - - - > " + str(balance))
                self.statementbuilder.set_current_balance(balance)

                # now find all the recent transactions
                x_table = soup.find('table', attrs={'class' : 'ItemTable'})

                if x_table is None:
                    # could easily be no transactions
                    logging.debug(" No xtable ======>")
                else:
                    x_body = x_table.find('tbody')

                    for row in x_body.findAll('tr'):
                        vals = row.findAll('td')
                        if not vals:
                            continue

                        atts = {}
                        cash = ''
                        for i, val in enumerate(vals):
                            data = self.tidy_text(val.text)

                            if i == 0:
                                logging.debug("date ======> " + data)

                                dp = DateParser()
                                try:
                                    atts['date'] = dp.ymd_from_date(dp.date_from_small(data))
                                except Exception:
                                    # Was ``atts['date'] == ''``: a
                                    # comparison that raised KeyError and
                                    # aborted the whole parse on the first
                                    # unparseable date.
                                    atts['date'] = ''

                            elif i == 1:
                                if data == 'ATM':
                                    cash = 'CASH - '

                            elif i == 2:
                                if data != "":
                                    extra = ""
                                    datebit = ""
                                    parts = data.split(',')
                                    if len(parts) > 1:
                                        # match RBS dates - a.la. 8062 14APR11
                                        if re.match(r'\d{4}\s\d\d[A-Z]{3}\d\d', parts[0]) is not None:
                                            datebit = parts[0][0:4] + ' ' + parts[0][5:7] + ' ' + parts[0][7:10]
                                            # remember pretty_display strips out any words containing a sequence of 3 or more numbers

                                            parts = parts[1:]

                                    # with more than one part left, the
                                    # last one goes to the extra display
                                    if len(parts) > 1:
                                        extra = parts[-1]
                                        parts = parts[0:-1]

                                    data = ' '.join(parts)

                                    disp = (cash + data).strip()

                                    # collapse runs of whitespace
                                    atts['display'] = " ".join(disp.split())

                                    atts['extradisplay'] = " ".join( (extra + " " + datebit).split())

                            else:  # i > 2: the numbers
                                if data != "" and data != '-':
                                    logging.debug('->' + data + '<-')
                                    amount = self.normalise_ammount(data)

                                    if i == 3:
                                        atts['amount'] = amount
                                        atts['type'] = 'Credit'

                                    if i == 4:
                                        atts['amount'] = amount
                                        atts['type'] = 'Debit'

                                # the row is complete once cell 5 was seen
                                if i == 5:
                                    self.statementbuilder.make_xact(atts)

            # NOTE(review): this also runs when account_path is empty, i.e.
            # on whatever builder self.statementbuilder already holds -
            # confirm that is intended.
            self.statementbuilder.put_statement()
            self.current_statement = self.current_statement + 1

        except Exception as e:
            logging.exception('RBS parsing error - ' + str(e))

Example #53

0

Show file

File: first_direct_scraper.py Project: JamieMcNaught/Bank-Scraper

    def processAccount(self, acCount, acName, account_path, allofit):
        """Parse one account page into a statement.

        acCount      -- account index, used only in the saved page's name.
        acName       -- not used by this method.
        account_path -- passed to StatementBuilder; when it is the empty
                        string the page is only saved and nothing is parsed.
        allofit      -- dict whose 'body' entry is the page, decoded
                        through self.HexToByte().

        A page containing the 'cmd_sort_referenceAscending' input is
        treated as a Visa account: its balance is read from the third
        'fdTableBackgroundOne' table and the transaction columns are
        shifted right by one. Other accounts take the balance from
        columns 4 and 5 of the first row that has one.

        Returns None. Parsing stops early, without putting the statement,
        when a date cell cannot be parsed.
        """

        def to_pence(text):
            # round() before int(): binary floats make
            # int(float('0.29') * 100) come out as 28 instead of 29.
            return int(round(float(text) * 100))

        page = self.HexToByte(allofit['body'])

        # save this page
        self.output_page("account" + str(acCount) + ".html", page)

        soup = BeautifulSoup(page)

        logging.debug('ac path - ' + str(account_path) + ' - end' )

        if account_path == "":
            return

        logging.debug('Processing :) ' )

        self.statementbuilder = StatementBuilder(self.facade, account_path, self.token)

        # need to get last statement and make a new one every time
        # TODO change this
        self.statementbuilder.make_recent_dif_statement('Fd-recent', 'Scraper', None)

        sort_input = soup.find('input', attrs={'name' : 'cmd_sort_referenceAscending'})
        isVisa = sort_input is not None
        if isVisa:
            bal_tables = soup.findAll('table', attrs={'class' : 'fdTableBackgroundOne'})

            # The original indexed [2] unconditionally (IndexError on a
            # page with fewer tables) and then compared the result with
            # None, which could never be true.
            if len(bal_tables) > 2:
                vals = bal_tables[2].findAll('td')

                # the balance is in the second cell
                if len(vals) > 1:
                    data = vals[1].text.replace('&#163;', u'£')
                    data = data.strip(u'£')
                    if data:
                        # A trailing 'D' or 'DB' marks a debit balance.
                        # The original only tested the last character,
                        # so a 'DB' suffix fell into the credit branch
                        # and failed in float().
                        if data.endswith(('D', 'DB')):
                            data = data.replace('DB', '')
                            data = data.replace('D', '')
                            firstbal = 0 - to_pence(data)
                        else:
                            data = data.replace('CR', '')
                            data = data.replace('C', '')
                            firstbal = to_pence(data)

                        self.statementbuilder.set_current_balance(firstbal)

        logging.debug("-----------------------------*******---------------------")
        if isVisa:
            logging.debug("found visa --")

        acTable = soup.find('table', attrs={'class' : 'fdStatTable'})

        # if no table then no new data afaik
        if acTable is None:
            return

        # Visa pages carry one extra column before the description.
        shift = 1 if isVisa else 0

        # NOTE(review): one dict is reused for every row, so values from a
        # previous row persist when a cell is empty - confirm make_xact
        # copies what it needs.
        atts = {}

        isFirst = True
        firstbal = 0
        lastdate = ""
        doBalance = False

        dp = DateParser()

        for row in acTable.findAll('tr'):
            vals = row.findAll('td')
            if not vals:
                continue

            for i, val in enumerate(vals):
                if val.text:
                    data = unicode(unescape(val.text.strip()))
                else:
                    data = ""

                # a cell holding only a non-breaking space carries nothing
                if data == "&nbsp;":
                    continue
                data = data.replace('&nbsp;', '')

                if i == 0:
                    if data != "":
                        try:
                            lastdate = dp.ymd_from_date(dp.date_from_dmy(data, '/'))
                        except Exception:
                            logging.warn("Invalid FD date format - probably no transactions")
                            return

                    # rows without a date inherit the previous row's date
                    atts['date'] = lastdate

                if i == 1 + shift:
                    # first 19 characters are the display text, rest is extra
                    atts['display'] = data[0:19]
                    atts['extradisplay'] = data[19:]

                if i == 2 + shift and data != "":
                    data = data.strip(u'£')
                    data = data.strip(u'D')
                    data = data.strip(u'B')
                    if data == '':
                        atts['amount'] = 0
                    else:
                        atts['amount'] = to_pence(data)
                    atts['type'] = 'Debit'

                if i == 3 + shift and data != "":
                    data = data.strip(u'£')
                    data = data.strip(u'C')
                    data = data.strip(u'R')
                    if data == '':
                        atts['amount'] = 0
                    else:
                        atts['amount'] = to_pence(data)
                    atts['type'] = 'Credit'

                if not isVisa:
                    # column 4 is the running balance, column 5 its sign;
                    # only the first balance found becomes the current one
                    if i == 4:
                        data = data.strip(u'£')
                        if data != "":
                            if isFirst:
                                isFirst = False
                                firstbal = to_pence(data)
                                doBalance = True

                    if i == 5 and doBalance:
                        doBalance = False
                        if data == "D":
                            firstbal = 0 - firstbal
                        self.statementbuilder.set_current_balance(firstbal)

            self.statementbuilder.make_xact(atts)

        self.statementbuilder.put_statement()
        self.current_statement = self.current_statement + 1

Example #54

0

Show file

File: rbs_scraper.py Project: JamieMcNaught/Bank-Scraper

    def _processCCAccount(self, raw, account_path, balance):
        """Parse a credit-card account page and record its recent transactions.

        When ``account_path`` is non-empty a new ``StatementBuilder`` is
        appended to ``self.statementlist``, a 'RBS-recent' statement is
        started, ``balance`` is set on it, and every row of the first
        ``ItemTable`` table found in ``raw`` is turned into an attribute
        dict.  A row is handed to ``make_xact`` only when it has a sixth
        cell (index 5).

        Args:
            raw: HTML source of the account page.
            account_path: account identifier passed to ``StatementBuilder``.
            balance: current balance, passed unchanged to
                ``set_current_balance``.

        Returns:
            None.  Any exception raised while parsing is logged with
            ``logging.exception`` and swallowed.
        """
        soup = BeautifulSoup(raw)

        logging.debug('CC ac path - ' + str(account_path) + ' - end' )

        try:
            if account_path != "":
                logging.debug('Processing :) ' )

                builder = StatementBuilder(self.facade, account_path, self.token)
                self.statementlist.append(builder)
                self.statementbuilder = self.statementlist[self.current_statement]

                # set up our statement
                self.statementbuilder.make_recent_dif_statement('RBS-recent', 'Scraper', None)

                # the balance is passed in by the caller for RBS
                logging.debug("Balance - - - - - - - > " + str(balance))
                self.statementbuilder.set_current_balance(balance)

                # now find all the recent transactions
                x_table = soup.find('table', attrs={'class' : 'ItemTable'})

                if x_table is not None:
                    # Fall back to the table itself when the markup has no
                    # <tbody>; previously that case raised AttributeError.
                    x_body = x_table.find('tbody')
                    if x_body is None:
                        x_body = x_table

                    for row in x_body.findAll('tr'):
                        cells = row.findAll('td')
                        if not cells:
                            continue

                        atts = {}
                        datebit = ''
                        for i, cell in enumerate(cells):
                            data = self.tidy_text(cell.text)

                            if i == 0:
                                logging.debug("date ======> " + data)

                                dp = DateParser()
                                try:
                                    atts['date'] = dp.ymd_from_date(dp.date_from_small(data))
                                except Exception:
                                    # Unparseable date: store an empty value.
                                    # (This used to be `==`, which raised
                                    # KeyError and aborted the whole parse.)
                                    atts['date'] = ''

                            elif i == 1:
                                # keep everything except the last five
                                # characters of the second column
                                datebit = data[:-5]

                            elif i == 2:
                                if data != 'SALE':           # only keep extra xact date for Sales
                                    datebit = ''

                            elif i == 3:
                                if data != "":
                                    atts['display'] = " ".join(data.split())
                                    atts['extradisplay'] = datebit

                            else:   # i > 3: the numbers
                                if data != "" and data != '-':
                                    amount = self.normalise_ammount(data)

                                    if i == 4:
                                        atts['amount'] = amount
                                        atts['type'] = 'Credit'
                                    elif i == 5:
                                        atts['amount'] = amount
                                        atts['type'] = 'Debit'

                                if i == 5:
                                    # sixth cell reached: the row is complete
                                    self.statementbuilder.make_xact(atts)

            # NOTE(review): this runs even when account_path is empty, exactly
            # as before - confirm that is intended.
            self.statementbuilder.put_statement()
            self.current_statement = self.current_statement + 1

        except Exception as e:
            logging.exception('RBS parsing error - ' + str(e))

Example #55

0

Show file

    def _processNormAccount(self, raw, account_path, balance):
        """Parse a normal (non credit-card) account page into transactions.

        When ``account_path`` is non-empty a new ``StatementBuilder`` is
        appended to ``self.statementlist``, a 'NatWest-recent' statement is
        started, ``balance`` is set on it, and every row of the first
        ``ItemTable`` table found in ``raw`` is turned into an attribute
        dict.  A row is handed to ``make_xact`` only when it has a sixth
        cell (index 5).

        Args:
            raw: HTML source of the account page.
            account_path: account identifier passed to ``StatementBuilder``.
            balance: current balance, passed unchanged to
                ``set_current_balance``.

        Returns:
            None.  Any exception raised while parsing is logged with
            ``logging.exception`` and swallowed.
        """
        soup = BeautifulSoup(raw)

        logging.debug('Norm ac path - ' + str(account_path) + ' - end')

        try:
            if account_path != "":
                logging.debug('Processing :) norm ')

                builder = StatementBuilder(self.facade, account_path,
                                           self.token)
                self.statementlist.append(builder)
                self.statementbuilder = self.statementlist[
                    self.current_statement]

                # set up our statement
                self.statementbuilder.make_recent_dif_statement(
                    'NatWest-recent', 'Scraper', None)

                # the balance is passed in by the caller
                logging.debug("Balance - - - - - - - > " + str(balance))
                self.statementbuilder.set_current_balance(balance)

                # now find all the recent transactions
                x_table = soup.find('table', attrs={'class': 'ItemTable'})

                if x_table is None:
                    # could easily be no transactions
                    logging.debug(" No xtable ======>")
                else:
                    # Fall back to the table itself when the markup has no
                    # <tbody>; previously that case raised AttributeError.
                    x_body = x_table.find('tbody')
                    if x_body is None:
                        x_body = x_table

                    for row in x_body.findAll('tr'):
                        cells = row.findAll('td')
                        if not cells:
                            continue

                        atts = {}
                        cash = ''
                        for i, cell in enumerate(cells):
                            data = self.tidy_text(cell.text)

                            if i == 0:
                                dp = DateParser()
                                try:
                                    atts['date'] = dp.ymd_from_date(
                                        dp.date_from_small(data))
                                except Exception:
                                    # Unparseable date: store an empty value.
                                    # (This used to be `==`, which raised
                                    # KeyError and aborted the whole parse.)
                                    atts['date'] = ''

                            elif i == 1:
                                if data == 'ATM':
                                    cash = 'CASH - '

                            elif i == 2:
                                if data != "":
                                    extra = ""
                                    datebit = ""
                                    parts = data.split(',')
                                    if len(parts) > 1:
                                        # match natwest dates - a.la. 8062 14APR11
                                        ref = parts[0]
                                        if re.match(
                                                r'\d{4}\s\d\d[A-Z]{3}\d\d',
                                                ref) is not None:
                                            datebit = (ref[0:4] + ' ' +
                                                       ref[5:7] + ' ' +
                                                       ref[7:10])
                                            # remember pretty_display strips out any words containing a sequence of 3 or more numbers

                                            parts = parts[1:]

                                    if len(parts) > 1:
                                        # the last comma-separated part goes
                                        # to the extra display text
                                        extra = parts[-1]
                                        parts = parts[0:-1]

                                    data = ' '.join(parts)

                                    disp = (cash + data).strip()

                                    atts['display'] = " ".join(disp.split())

                                    atts['extradisplay'] = " ".join(
                                        (extra + " " + datebit).split())

                            else:  # i > 2: the numbers
                                if data != "" and data != '-':
                                    amount = self.normalise_ammount(data)

                                    if i == 3:
                                        atts['amount'] = amount
                                        atts['type'] = 'Credit'
                                    elif i == 4:
                                        atts['amount'] = amount
                                        atts['type'] = 'Debit'

                                if i == 5:
                                    # sixth cell reached: the row is complete
                                    self.statementbuilder.make_xact(atts)

            # NOTE(review): this runs even when account_path is empty, exactly
            # as before - confirm that is intended.
            self.statementbuilder.put_statement()
            self.current_statement = self.current_statement + 1

        except Exception as e:
            logging.exception('NatWest parsing error - ' + str(e))

Example #56

0

Show file

    def processAccount(self, acCount, acName, account_path, allofit):
        """Parse one downloaded account page and record its transactions.

        The page body is decoded with ``HexToByte``, saved through
        ``output_page`` and parsed.  When ``account_path`` is non-empty a
        'Fd-recent' statement is started on a new ``StatementBuilder`` and
        every ``<tr>`` of the ``fdStatTable`` table that contains ``<td>``
        cells is passed to ``make_xact``.

        A page containing an input named ``cmd_sort_referenceAscending`` is
        treated as a Visa page: its transaction columns are shifted one to
        the right and its balance is read from the third
        ``fdTableBackgroundOne`` table instead of from the statement rows.

        Args:
            acCount: account index, used only in the saved page's file name.
            acName: unused by this method.
            account_path: account identifier passed to ``StatementBuilder``;
                nothing is parsed when it is the empty string.
            allofit: mapping whose ``'body'`` entry holds the encoded page.

        Returns:
            None.  Returns early, without storing the statement, when a date
            cell cannot be parsed.
        """
        page = self.HexToByte(allofit['body'])

        # save this page
        self.output_page("account" + str(acCount) + ".html", page)

        soup = BeautifulSoup(page)

        logging.debug('ac path - ' + str(account_path) + ' - end')

        def to_pence(text):
            # round() before int(): float('1.15') * 100 is 114.999..., which a
            # bare int() truncates to 114.
            return int(round(float(text) * 100))

        if account_path != "":
            logging.debug('Processing :) ')

            self.statementbuilder = StatementBuilder(self.facade, account_path,
                                                     self.token)

            # need to get last statement and make a new one every time
            self.statementbuilder.make_recent_dif_statement(
                'Fd-recent', 'Scraper', None)  #TODO change this

            isVisa = False
            loginform = soup.find(
                'input', attrs={'name': 'cmd_sort_referenceAscending'})
            if loginform is not None:
                isVisa = True

                bal_tables = soup.findAll(
                    'table', attrs={'class': 'fdTableBackgroundOne'})

                # The balance is in the second cell of the third table; skip
                # the balance quietly when the page does not have that shape
                # instead of failing with IndexError.
                if len(bal_tables) > 2:
                    vals = bal_tables[2].findAll('td')

                    if len(vals) > 1:
                        data = vals[1].text.replace('&#163;', u'£')
                        data = data.strip(u'£')

                        if data != '':
                            # Accept both 'D' and 'DB' suffixes: the old
                            # `data[-1] == 'D'` test never matched 'DB'.
                            if data.endswith(('D', 'DB')):
                                data = data.replace('DB', '')
                                data = data.replace('D', '')
                                firstbal = 0 - to_pence(data)
                            else:
                                data = data.replace('CR', '')
                                data = data.replace('C', '')
                                firstbal = to_pence(data)

                            self.statementbuilder.set_current_balance(firstbal)

            logging.debug(
                "-----------------------------*******---------------------")
            if isVisa:
                logging.debug("found visa --")

            acTable = soup.find('table', attrs={'class': 'fdStatTable'})

            # if no table then no new data afaik
            if acTable is not None:
                datarows = acTable.findAll('tr')

                # NOTE(review): one dict is reused for every row, so values
                # from an earlier row persist when a later cell is blank -
                # confirm make_xact copies what it needs.
                atts = {}

                isFirst = True
                firstbal = 0
                lastbal = 0
                lastdate = ""

                doBalance = False

                dp = DateParser()

                # Visa rows have one extra column before the description
                shift = 1 if isVisa else 0

                for rows in datarows:
                    vals = rows.findAll('td')

                    if vals:
                        for i, val in enumerate(vals):

                            if val.text:
                                data = val.text.strip()
                                data = unescape(data)
                                data = unicode(data)
                            else:
                                data = ""

                            if data == "&nbsp;":
                                continue

                            data = data.replace('&nbsp;', '')
                            col = i - shift

                            if i == 0:
                                if data != "":
                                    try:
                                        lastdate = dp.ymd_from_date(
                                            dp.date_from_dmy(data, '/'))
                                    except Exception:
                                        logging.warning(
                                            "Invalid FD date format - probably no transactions"
                                        )
                                        return

                                # a blank date cell reuses the last date seen
                                atts['date'] = lastdate

                            elif col == 1:
                                atts['display'] = data[0:19]
                                atts['extradisplay'] = data[19:]

                            elif col == 2:
                                if data != "":
                                    # strip the currency sign and any D/DB
                                    # marker in one pass
                                    data = data.strip(u'£DB')
                                    if data == '':
                                        atts['amount'] = 0
                                    else:
                                        atts['amount'] = to_pence(data)
                                    atts['type'] = 'Debit'

                            elif col == 3:
                                if data != "":
                                    # strip the currency sign and any C/CR
                                    # marker in one pass
                                    data = data.strip(u'£CR')
                                    if data == '':
                                        atts['amount'] = 0
                                    else:
                                        atts['amount'] = to_pence(data)
                                    atts['type'] = 'Credit'

                            elif not isVisa and i == 4:
                                data = data.strip(u'£')
                                if data != "":
                                    lastbal = to_pence(data)

                                    # the first balance seen becomes the
                                    # current balance
                                    if isFirst:
                                        isFirst = False
                                        firstbal = lastbal
                                        doBalance = True

                            elif not isVisa and i == 5:
                                if doBalance:
                                    doBalance = False
                                    # a "D" in this column makes the balance
                                    # negative
                                    if data == "D":
                                        firstbal = 0 - firstbal
                                    self.statementbuilder.set_current_balance(
                                        firstbal)

                        self.statementbuilder.make_xact(atts)

                self.statementbuilder.put_statement()
                self.current_statement = self.current_statement + 1

Example #57

0

Show file

###################################################
# RMIT University, Melbourne
# Date 27 Mar 2012
# By Emil Broegger Kjer
# For questions or comments contact [email protected]
###################################################

from BeautifulSoup.BeautifulSoup import BeautifulSoup
import urllib2, re

#### Read from URL
page = urllib2.urlopen(
    "http://tt.metlinkmelbourne.com.au/tt/XSLT_TTB_REQUEST?command=direct&language=en&outputFormat=0&net=vic&line=02EPP&project=ttb&itdLPxx_selLineDir=R&sup=B"
)
soup = BeautifulSoup(page)

#### Read from file
# transport_line = "epping_line"
# weekday = "weekday"
# direction = "true"
# filestr = ('data/timetables/%s_%s_%s.html' % (transport_line, weekday, direction))
# fil = open(filestr, "r")
# soup = BeautifulSoup(fil.read(), fromEncoding='utf8')
# fil.close()

divs = soup.html.body.findAll('div')
children = divs[0].contents

#### Set the timetable
tt = children[1].contents[3].contents[3].contents[3].contents[1].contents[2]