def _parseComment(self, communityId, liveInfoFilePath, commentFilePath): chatList = [] if not (os.path.exists(liveInfoFilePath) and os.path.exists(commentFilePath)): return chatList infoParser = BeautifulSoup(open(liveInfoFilePath, u'r')) if not infoParser.find(u'communityid').renderContents() == communityId: return chatList commentParser = BeautifulSoup(open(commentFilePath, u'r')) chatTagList = commentParser.findAll(u'chat', attrs={u'msgkind': u'message_msg'}) for chatTag in chatTagList: communityId = communityId.decode(u'utf-8') liveId = infoParser.find(u'liveid').renderContents().decode() userId = chatTag.get(u'user').decode(u'utf-8') name = chatTag.get(u'nickname').decode(u'utf-8') message = chatTag.renderContents().decode(u'utf-8') option = chatTag.get(u'mail').decode(u'utf-8') if chatTag.get(u'mail') != '' else None date = re.sub( ur'(\d{4})/(\d{1,2})/(\d{1,2})\s(\d{1,2}):(\d{1,2}):(\d{1,2})', lambda match: u'{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}'.format(int(match.group(1)), int(match.group(2)), int(match.group(3)), int(match.group(4)), int(match.group(5)), int(match.group(6))), chatTag.get(u'date') ).decode(u'utf-8') chatList.append((communityId, liveId, userId, name, message, option, date)) return chatList
def __init__( self ):
    """List the seasons ("Full Episodes" nav entries) of an NBC show.

    Reads 'showUrl' from the plugin arguments, scrapes the show page
    and adds one XBMC folder item per season.
    """
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['showUrl']))
    print 'Loading ' + self.BASE_URL + unquote_plus(params['showUrl'])
    seasonIndex = BeautifulSoup(source)
    # The title container id/class has changed across site redesigns:
    # try the known ids first, then the known classes.
    tvshowcontainer = seasonIndex.find('div',id=re.compile('scet_header|scet_top|show-header|show-header-scet|^header$'))
    if tvshowcontainer==None:
        tvshowcontainer=seasonIndex.find('div',{'class':re.compile('scet_header|scet_top|show-header|show-header-scet')})
    if tvshowcontainer!=None:
        tvshowtitle = tvshowcontainer.find('h1').string
    else:
        # Last resort: the title also appears in an inline script variable.
        tvshowtitle = re.search('var siteName = "(.+?)";',source).group(1)
    print 'Parsing seasons for "%s"' % tvshowtitle
    # Season links live in the gallery nav, in the <ul> following the
    # "Full Episodes" heading.
    showsListing = seasonIndex.find('div',{"class":re.compile('scet-gallery-nav')}).find('h3',text='Full Episodes').parent.findNextSibling('ul').findAll('li')
    for show in showsListing:
        showLink = show.find('a')
        print 'Found '+showLink.string
        listitem=xbmcgui.ListItem(decode_htmlentities(showLink.string))
        listitem.setInfo('video',{'tvshowtitle':tvshowtitle})
        #listitem.setThumbnailImage(showLink.find('img')['src'])
        # Normalise the href to a site-relative URL for 'seasonUrl'.
        if showLink['href'][0] == '/':
            showUrl = showLink['href'][1:]
        else:
            showUrl = showLink['href'].replace(self.BASE_URL,'')
        xbmcplugin.addDirectoryItem(handle=int( sys.argv[ 1 ] ),listitem=listitem,url="%s?seasonUrl=%s" % ( sys.argv[ 0 ], quote_plus(showUrl),),totalItems=len(showsListing),isFolder=True)
    xbmcplugin.setContent( handle=int(sys.argv[1]), content='seasons')
    xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ), succeeded=1 )
def __init__(self):
    """Build the episode listing for an NBC show season.

    Reads 'seasonUrl' from the plugin arguments, walks every pager page
    of the season, and adds one directory item per episode. Season and
    episode numbers are parsed from "EpSNN"-style titles when present.
    """
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['seasonUrl']))
    showIndex = BeautifulSoup(source)
    # The title container id/class varies across site redesigns.
    tvshowcontainer = showIndex.find('div', id=re.compile('scet_header|scet_top|show-header|show-header-scet|^header$'))
    if tvshowcontainer == None:
        tvshowcontainer = showIndex.find('div', {'class': re.compile('scet_header|scet_top|show-header|show-header-scet')})
    if tvshowcontainer != None:
        tvshowtitle = tvshowcontainer.find('h1').string
    else:
        # Fallback: the title also appears in an inline script variable.
        tvshowtitle = re.search('var siteName = "(.+?)";', source).group(1)
    # Count pager pages so every page of the season gets scraped.
    pages = 1
    if showIndex.find('div', {'class': re.compile('nbcu_pager')}):
        pageLinks = showIndex.find('div', {'class': re.compile('nbcu_pager')}).findAll('a', {'class': re.compile('nbcu_pager_page')})
        pages = len(pageLinks)
    for i in range(0, pages):
        if i > 0:
            source = self._fetch_url(self.BASE_URL + pageLinks[i]['href'])
            showIndex = BeautifulSoup(source)
        episodesListing = showIndex.find('ul', {'class': re.compile('scet_th_full')}).findAll('li')
        for episode in episodesListing:
            vidInfo = {'tvshowtitle': tvshowtitle, 'studio': 'NBC'}
            title = decode_htmlentities(episode.find('p', {'class': re.compile('list_full_det_title')}).find('a').string)
            listitem = xbmcgui.ListItem(title)
            listitem.setThumbnailImage(episode.find('img')['src'])
            episodeLink = episode.find('a')
            # Normalise the href to a site-relative URL.
            if episodeLink['href'][0] == '/':
                episodeUrl = episodeLink['href'][1:]
            else:
                episodeUrl = episodeLink['href'].replace(self.BASE_URL, '')
            if episode.find('p', {'class': re.compile('list_full_des')}):
                vidInfo['plot'] = decode_htmlentities(episode.find('p', {'class': re.compile('list_full_des')}).find('em').string)
            # Titles like "Ep512: Name" encode season 5, episode 12.
            epNum = re.search('^Ep(?:\.\s*)?([0-9]{1,2})([0-9][0-9])(?:\s*:\s*)?(.+)$', title)
            if epNum != None:
                vidInfo['season'] = int(epNum.group(1))
                vidInfo['episode'] = int(epNum.group(2))
                vidInfo['title'] = epNum.group(3)
            listitem.setInfo("video", vidInfo)
            # Use .get() with a 0 default so a title without an "EpSNN"
            # prefix no longer raises KeyError when building the URL.
            xbmcplugin.addDirectoryItem(
                handle=int(sys.argv[1]), listitem=listitem,
                url="%s?episodeUrl=%s&episode=%s&season=%s" % (sys.argv[0], quote_plus(episodeUrl), vidInfo.get('episode', 0), vidInfo.get('season', 0)))
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]), sortMethod=xbmcplugin.SORT_METHOD_EPISODE)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]), sortMethod=xbmcplugin.SORT_METHOD_DATE)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]), sortMethod=xbmcplugin.SORT_METHOD_LABEL)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]), sortMethod=xbmcplugin.SORT_METHOD_DURATION)
    xbmcplugin.setContent(handle=int(sys.argv[1]), content='episodes')
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def doStep3(self, allofit, page): scrape_result = "good" #-------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) # write out the start page self.output_page("natwest-security.html", page) scrape_result = 'good' logging.info("NatWest security page2") # check if we got returned # check for the password input ctl00$mainContent$LI5TABA$DBID_edit then we didnt move on errorDiv = soup.findAll( 'input', attrs={'name': 'ctl00$mainContent$LI5TABA$DBID_edit'}) if len(errorDiv) != 0: logging.info("NatWest security page1 still - customer number bad") return 'credentials incorrect' # if we get here then the form was found hence creds must be wrong # find our form loginform = soup.find('form', attrs={'name': 'aspnetForm'}) if loginform == None: logging.debug('NatWest no security form') return 'bank error' values = self.parseForm(loginform) # define some variables that would only otherwise exist in a try catch block scope # the label text split on spaces which1arr = "" which2arr = "" which3arr = "" # the chalenges firstDigit = "" secondDigit = "" thirdDigit = "" #>>>>>>> The first set of Pin fields #-------------------- get the questions --------------# #<label for="ctl00_mainContent_Tab1_LI6PPEA_edit" id="ctl00_mainContent_Tab1_LI6DDALALabel" class="wizCrl wizardLabelRememberMeWide">Enter the 2nd number</label> useNewTab = False try: which1 = soup.find('label', attrs={ 'for': 'ctl00_mainContent_LI6PPEA_edit' }).text except Exception, e: useNewTab = True
def doStep3(self, allofit, page): scrape_result = "good" #-------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) # write out the start page self.output_page("RBS-security.html", page) scrape_result = 'good' logging.info("RBS security page2") # check if we got returned # check for the password input ctl00$mainContent$LI5TABA$DBID_edit then we didnt move on errorDiv=soup.findAll('input', attrs={'name' : 'ctl00$mainContent$LI5TABA$DBID_edit'}) if len(errorDiv) != 0: logging.info("RBS security page1 still - customer number bad") return 'credentials incorrect' # if we get here then the form was found hence creds must be wrong # find our form loginform=soup.find('form', attrs={'name': 'aspnetForm'}) if loginform == None: logging.debug('RBS no security form') return 'bank error' values = self.parseForm(loginform) # define some variables that would only otherwise exist in a try catch block scope # the label text split on spaces which1arr = "" which2arr = "" which3arr = "" # the chalenges firstDigit = "" secondDigit = "" thirdDigit = "" #>>>>>>> The first set of Pin fields #-------------------- get the questions --------------# #<label for="ctl00_mainContent_Tab1_LI6PPEA_edit" id="ctl00_mainContent_Tab1_LI6DDALALabel" class="wizCrl wizardLabelRememberMeWide">Enter the 2nd number</label> useNewTab = False try: which1=soup.find('label', attrs={'for' : 'ctl00_mainContent_LI6PPEA_edit'}).text except Exception, e: useNewTab = True
def doStep7(self, page):
    """Request one month of transactions from the NatWest account page.

    Parses the aspnetForm on the account-link page, selects the
    one-month period and the "View Transactions" button, and stores the
    resulting POST request in self.response as step 20. Returns 'good',
    or 'bank error' when the form is missing.
    """
    soup = BeautifulSoup(page)
    self.output_page("natwest-acclink.html", page)
    form = soup.find('form', attrs={'name': 'aspnetForm'})
    if form == None:
        logging.debug('NatWest no view account form')
        return 'bank error'
    target = self.urlBase + '/' + form['action']
    fields = self.parseForm(form)
    # One-month statement period plus the default submit button.
    fields['ctl00$mainContent$SS2SPDDA'] = 'M1'
    fields['ctl00$mainContent$NextButton_button'] = 'View Transactions'
    body = urllib.urlencode(fields)
    self.response = {
        'url': self.ByteToHex(target),
        'data': self.ByteToHex(body),
        'method': 'POST',
        'step': 20,
    }
    return 'good'
def doStep1(self, allofit, page):
    """Follow the security frame from the NatWest landing page.

    Locates the ctl00_secframe frame and queues a GET for its src as
    step 2 in self.response. Returns 'good', or 'bank error' when the
    frame is absent.
    """
    logging.info("NatWest page1")
    # write out the start page (headers could be read from
    # allofit['headers'] via HexToByte if ever needed)
    self.output_page("1_first_page.html", page)
    soup = BeautifulSoup(page)
    secframe = soup.find('frame', id='ctl00_secframe')
    if secframe == None:
        logging.debug('NatWest frame link error - ')
        return 'bank error'
    # e.g. <frame id="ctl00_secframe" src="login.aspx?refererident=...">
    target = self.urlBase + '/' + secframe['src']
    self.response = {
        'url': self.ByteToHex(target),
        'data': "",
        'method': 'GET',
        'step': 2,
    }
    return 'good'
def doStep2(self, allofit, page):
    """Submit the RBS customer number on the login form.

    Fills ctl00$mainContent$LI5TABA$DBID_edit with stored credential
    '01' and queues the POST as step 3 in self.response. Returns 'good'
    or 'bank error' when the aspnetForm is missing.
    """
    soup = BeautifulSoup(page)
    self.output_page("RBS-username.html", page)
    form = soup.find('form', attrs={'name': 'aspnetForm'})
    if form == None:
        logging.debug('RBS no login form')
        return 'bank error'
    target = self.urlBase + '/' + form['action']
    fields = self.parseForm(form)
    # customer number
    fields["ctl00$mainContent$LI5TABA$DBID_edit"] = self.filledCreds['01']
    self.response = {
        'url': self.ByteToHex(target),
        'data': self.ByteToHex(urllib.urlencode(fields)),
        'method': 'POST',
        'step': 3,
    }
    return 'good'
def __init__( self ): params = self._parse_argv() source = self._fetch_url(self.BASE_URL + unquote_plus(params['showUrl'])) showIndex = BeautifulSoup(source) vidInfo = {'tvshowtitle': showIndex.find('div',id='showDashboard').find('span',{'class':'blueText'}).string, 'studio': 'FOX'} seasonsListing = showIndex.findAll('div',{'class':re.compile('dashPageHolder'),'id':re.compile('^fullEp')}) print len(seasonsListing) for season in seasonsListing: episodesListing = season.findAll('div',{'class':'episodeListing'}) for episode in episodesListing: listitem = xbmcgui.ListItem(episode.find('h3').find('a').string) listitem.setThumbnailImage(episode.find('img',id=re.compile('^epThumb'))['src']) episodeLink = episode.find('a',{'class':'thumbnailLink'}) if episodeLink['href'][0] == '/': episodeUrl = episodeLink['href'][1:] else: episodeUrl = episodeLink['href'].replace(self.BASE_URL,'') airedDateAndPlot = re.search('Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$',str(episode.find('div',{'class':'episodeInfo'}))) seasonNum = re.search('Season\s+([0-9]+?)[\s:]',str(episode.find('p',{'class':'seasonNum'}))) episodeNumAndDuration = re.search('Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)',str(episode.find('p',{'class':'episodeNumLine'}))) vidInfo['aired'] = '%s-%s-%s' % (airedDateAndPlot.group(3),airedDateAndPlot.group(1),airedDateAndPlot.group(2)) vidInfo['season'] = int(seasonNum.group(1)) vidInfo['episode'] = int(episodeNumAndDuration.group(1)) vidInfo['duration'] = episodeNumAndDuration.group(2) vidInfo['title'] = episode.find('h3').find('a').string vidInfo['plot'] = decode_htmlentities(airedDateAndPlot.group(4)) print vidInfo listitem.setInfo("video",vidInfo) xbmcplugin.addDirectoryItem(handle=int( sys.argv[ 1 ] ),listitem=listitem,url="%s?episodeUrl=%s" % ( sys.argv[ 0 ], quote_plus(episodeUrl))) xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ), succeeded=1 )
def doStep6(self, page):
    """Request one month of transactions from the RBS account page.

    Fills the aspnetForm with a one-month period selection plus the
    "View Transactions" button and queues the POST as step 20. Returns
    'good', or 'bank error' when the form is missing.
    """
    #-------------------------------- Grab the form values -----------------------------------------------
    soup = BeautifulSoup(page)
    self.output_page("RBS-acclink.html", page)
    loginform=soup.find('form', attrs={'name': 'aspnetForm'})
    if loginform == None:
        logging.debug('RBS no view account form')
        return 'bank error'
    action = self.urlBase + '/' + loginform['action']
    values = self.parseForm(loginform)
    # fill in our selection - 1 month
    values['ctl00$mainContent$SS2SPDDA'] = 'M1'
    # default button - needed
    values['ctl00$mainContent$NextButton_button'] = 'View Transactions'
    # build the body content
    data = urllib.urlencode(values)
    self.response = {}
    self.response['url'] = self.ByteToHex(action)
    self.response['data'] = self.ByteToHex(data)
    self.response['method'] = 'POST'
    self.response['step'] = 20
    return 'good'
def doStep2(self, allofit, page):
    """Submit the NatWest customer number on the login form.

    Fills ctl00$mainContent$LI5TABA$DBID_edit with stored credential
    '01' and queues the POST as step 3. Returns 'good' or 'bank error'.
    """
    # -------------------------------- Grab the form values -----------------------------------------------
    soup = BeautifulSoup(page)
    self.output_page("natwest-username.html", page)
    loginform = soup.find("form", attrs={"name": "aspnetForm"})
    if loginform == None:
        logging.debug("NatWest no login form")
        return "bank error"
    action = self.urlBase + "/" + loginform["action"]
    values = self.parseForm(loginform)
    # fill in our credentials
    values["ctl00$mainContent$LI5TABA$DBID_edit"] = self.filledCreds["01"]  # customer number
    # build the body content
    data = urllib.urlencode(values)
    self.response = {}
    self.response["url"] = self.ByteToHex(action)
    self.response["data"] = self.ByteToHex(data)
    self.response["method"] = "POST"
    self.response["step"] = 3
    return "good"
def doStep2(self, allofit, page):
    """Post the stored NatWest customer number into the login form.

    Returns 'good' after queueing the step-3 POST in self.response, or
    'bank error' when the aspnetForm cannot be found.
    """
    soup = BeautifulSoup(page)
    self.output_page("natwest-username.html", page)
    form = soup.find('form', attrs={'name': 'aspnetForm'})
    if form == None:
        logging.debug('NatWest no login form')
        return 'bank error'
    target = self.urlBase + '/' + form['action']
    fields = self.parseForm(form)
    # customer number
    fields["ctl00$mainContent$LI5TABA$DBID_edit"] = self.filledCreds['01']
    self.response = {
        'url': self.ByteToHex(target),
        'data': self.ByteToHex(urllib.urlencode(fields)),
        'method': 'POST',
        'step': 3,
    }
    return 'good'
def get(self):
    """Fetch Cleartrip flight results and render them as an HTML table.

    Reads date/from/to query parameters, scrapes the embedded
    <script id="json"> block from the results page and writes price and
    per-leg details for each itinerary statement found.

    NOTE(review): `content = eval(...)` below executes text taken from
    a remote page — a code-injection risk; replace with a JSON parser.
    """
    dates = self.request.get('date')
    froms = self.request.get("from")
    to = self.request.get("to")
    #froms= "BOM"
    #to = "CBJ"
    resp=urlfetch.fetch("http://www.cleartrip.com/flights/results?from="+froms+"&to="+to+"&depart_date="+dates+"&adults=1&childs=0&infants=0&dep_time=0&class=Economy&airline=&carrier=&x=57&y=16&flexi_search=no&tb=n")
    soup = BS(resp.content)
    # The results page embeds its data in a <script id="json"> element;
    # its text is a sequence of ';'-separated JS assignments.
    my_content = soup.find("script",{"id":"json"})
    string_resp = str(my_content).strip()
    #self.response.out.write(str(string_resp))
    resp_splitted = string_resp.split(';')
    #self.response.out.write(str(resp_splitted))
    self.response.headers['Content-Type'] = 'text/html'
    self.response.out.write('<html><body><table>')
    # Negative start index for the slice of statements of interest,
    # counted from the end of the split script text.
    a = 2-len(resp_splitted)
    #self.response.out.write(string_resp)
    #query2 = resp_splitted[-10].split('=')
    #self.response.out.write(query2[1])
    """content = eval(query2[1])
    self.response.out.write('<tr><td>Price</td>')
    self.response.out.write('<td>'+content['pr']+'</td></tr>')
    legs = content['legs']
    i = 0
    for leg in legs:
        self.response.out.write('<tr><td>Way '+str(i)+':</td></tr>')
        self.response.out.write('<td>'+leg['fr'] + "to "+ leg['to'] +'</td>')
        self.response.out.write('<tr><td>Arrival '+str(i)+':</td></tr>')
        self.response.out.write ('<td>'+leg['a']+'</td>')
        self.response.out.write('<tr><td>Departure '+str(i)+':</td></tr>')
        self.response.out.write ('<td>'+leg['dp']+'</td>')
        i+=1"""
    for query in range(a,-9):
        # Each statement looks like "name = {...}"; take the RHS.
        query2 = resp_splitted[query].strip().split('=')
        try:
            # NOTE(review): eval of remote page content — see docstring.
            content = eval(query2[1])
            self.response.out.write("<tr><td>******************</td></tr>")
            self.response.out.write('<tr><td>Price</td>')
            self.response.out.write('<td>'+str(content.get('pr'))+'</td></tr>')
            legs = content.get('legs')
            i = 0
            for leg in legs:
                i+=1
                self.response.out.write('<tr><td>Way '+str(i)+':</td>')
                self.response.out.write('<td>'+leg.get('fr') + " => "+ leg['to'] +'</td></tr>')
                self.response.out.write('<tr><td>Arrival '+str(i)+':</td>')
                self.response.out.write ('<td>'+str(leg.get('a'))+'</td></tr>')
                self.response.out.write('<tr><td>Departure '+str(i)+':</td>')
                self.response.out.write ('<td>'+str(leg.get('dp'))+'</td></tr>')
        except:
            # NOTE(review): bare except silently skips statements that
            # fail to eval or lack the expected keys.
            pass
def DoStep3(self, allofit):
    """Check the first direct summary page for the balances table.

    Stores the per-account action links on self.accountLinks. Returns
    'good' when links were found, 'bank error' when the table exists
    but has no links, and 'credentials incorrect' when the table is
    missing entirely.
    """
    page = self.HexToByte(allofit['body'])
    self.output_page("fd-summary.html", page)
    soup = BeautifulSoup(page)
    balances = soup.find('table', attrs={'class': 'fdBalancesTable'})
    if balances == None:
        logging.debug("No fd table")
        return 'credentials incorrect'
    self.accountLinks = balances.findAll('a', attrs={'class': 'fdActionLink'})
    if len(self.accountLinks) == 0:
        # got some kind of message page instead of account links
        logging.info('Still got no accounts')
        return 'bank error'
    return 'good'
def firstPass(self, page):
    """Derive the base URL from the login form and queue the username POST.

    Parses the first <form>, records its scheme+host on self.urlBase,
    and stores a POST of credential '03' (userid) as step 2 in
    self.response.
    """
    soup = BeautifulSoup(page)
    loginform = soup.find('form')
    action = loginform['action']
    # Remember the origin so later relative form actions can be resolved.
    self.urlBase = "https://" + urlparse(action).netloc
    logging.info("Base URL = " + self.urlBase)
    # (The form's <input> elements carry nothing we need; only the
    # userid is posted.)
    values = {'userid': self.filledCreds['03']}  # username
    # build the body content
    data = urllib.urlencode(values)
    self.response = {}
    self.response['url'] = self.ByteToHex(action)
    self.response['data'] = self.ByteToHex(data)
    self.response['method'] = 'POST'
    self.response['step'] = 2
def firstPass(self, page):
    """Open the login sequence: capture the form target and post the userid.

    Saves the form's origin on self.urlBase and queues step 2 as a POST
    of credential '03' (the username) to the form action.
    """
    form = BeautifulSoup(page).find('form')
    action = form['action']
    parts = urlparse(action)
    self.urlBase = "https://" + parts.netloc
    logging.info("Base URL = " + self.urlBase)
    inputs = form.findAllNext('input')
    payload = urllib.urlencode({'userid': self.filledCreds['03']})  # username
    self.response = {
        'url': self.ByteToHex(action),
        'data': self.ByteToHex(payload),
        'method': 'POST',
        'step': 2,
    }
def DoStep3(self, allofit):
    """Check the first direct summary page for the balances table.

    Saves the per-account action links on self.accountLinks. Returns
    'good', 'bank error' (table present but empty), or 'credentials
    incorrect' (table missing).
    """
    scrape_result = "good"
    page = self.HexToByte(allofit['body'])
    #-------------------------------- Grab the form values -----------------------------------------------
    soup = BeautifulSoup(page)
    self.output_page("fd-summary.html", page)
    accountTable = soup.find('table', attrs={'class': 'fdBalancesTable'})
    if accountTable != None:
        self.accountLinks = accountTable.findAll(
            'a', attrs={'class': 'fdActionLink'})
        if len(self.accountLinks) == 0:
            #got some kind of message
            scrape_result = 'bank error'
            logging.info('Still got no accounts')
    else:
        logging.debug("No fd table")
        scrape_result = 'credentials incorrect'
    return scrape_result
def doStep1(self, allofit, page):
    """Follow the security frame from the NatWest landing page.

    Locates frame ctl00_secframe and queues a GET for its src as
    step 2. Returns 'good', or 'bank error' when the frame is missing.
    """
    body = page
    scrape_result = "good"
    logging.info("NatWest page1")
    # the following is how you could retrieve the headers from the request
    # for head in allofit['headers']:
    #     name = self.HexToByte(head['name'])
    #     val = self.HexToByte(head['value'])
    # write out the start page
    self.output_page("1_first_page.html", body)
    soup = BeautifulSoup(body)
    frame = soup.find("frame", id="ctl00_secframe")
    if frame != None:
        action = self.urlBase + "/" + frame["src"]
        # <frame id="ctl00_secframe" title="Security Frame" frameborder="0" src="login.aspx?refererident=774C53DCE4C17556595C91973A6DF1A0A1F6242E&cookieid=100714"></frame>
        self.response = {}
        self.response["url"] = self.ByteToHex(action)
        self.response["data"] = ""
        self.response["method"] = "GET"
        self.response["step"] = 2
    else:
        logging.debug("NatWest frame link error - ")
        scrape_result = "bank error"
    return scrape_result
def doStep7(self, page):
    """Request one month of NatWest transactions.

    Selects the one-month period plus the "View Transactions" button in
    the aspnetForm and queues the POST as step 20. Returns 'good', or
    'bank error' when the form is missing.
    """
    # -------------------------------- Grab the form values -----------------------------------------------
    soup = BeautifulSoup(page)
    self.output_page("natwest-acclink.html", page)
    loginform = soup.find("form", attrs={"name": "aspnetForm"})
    if loginform == None:
        logging.debug("NatWest no view account form")
        return "bank error"
    action = self.urlBase + "/" + loginform["action"]
    values = self.parseForm(loginform)
    # fill in our selection - 1 month
    values["ctl00$mainContent$SS2SPDDA"] = "M1"
    # default button - needed
    values["ctl00$mainContent$NextButton_button"] = "View Transactions"
    # build the body content
    data = urllib.urlencode(values)
    self.response = {}
    self.response["url"] = self.ByteToHex(action)
    self.response["data"] = self.ByteToHex(data)
    self.response["method"] = "POST"
    self.response["step"] = 20
    return "good"
def __init__(self):
    """List the seasons ("Full Episodes" nav entries) of an NBC show.

    Reads 'showUrl' from the plugin arguments, scrapes the show page
    and adds one XBMC folder item per season.
    """
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['showUrl']))
    print 'Loading ' + self.BASE_URL + unquote_plus(params['showUrl'])
    seasonIndex = BeautifulSoup(source)
    # The title container id/class has changed across site redesigns:
    # try the known ids first, then the known classes.
    tvshowcontainer = seasonIndex.find(
        'div',
        id=re.compile(
            'scet_header|scet_top|show-header|show-header-scet|^header$'))
    if tvshowcontainer == None:
        tvshowcontainer = seasonIndex.find(
            'div', {
                'class':
                re.compile(
                    'scet_header|scet_top|show-header|show-header-scet')
            })
    if tvshowcontainer != None:
        tvshowtitle = tvshowcontainer.find('h1').string
    else:
        # Last resort: the title also appears in an inline script variable.
        tvshowtitle = re.search('var siteName = "(.+?)";', source).group(1)
    print 'Parsing seasons for "%s"' % tvshowtitle
    # Season links live in the gallery nav, in the <ul> following the
    # "Full Episodes" heading.
    showsListing = seasonIndex.find('div', {
        "class": re.compile('scet-gallery-nav')
    }).find(
        'h3',
        text='Full Episodes').parent.findNextSibling('ul').findAll('li')
    for show in showsListing:
        showLink = show.find('a')
        print 'Found ' + showLink.string
        listitem = xbmcgui.ListItem(decode_htmlentities(showLink.string))
        listitem.setInfo('video', {'tvshowtitle': tvshowtitle})
        #listitem.setThumbnailImage(showLink.find('img')['src'])
        # Normalise the href to a site-relative URL for 'seasonUrl'.
        if showLink['href'][0] == '/':
            showUrl = showLink['href'][1:]
        else:
            showUrl = showLink['href'].replace(self.BASE_URL, '')
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]),
                                    listitem=listitem,
                                    url="%s?seasonUrl=%s" % (
                                        sys.argv[0],
                                        quote_plus(showUrl),
                                    ),
                                    totalItems=len(showsListing),
                                    isFolder=True)
    xbmcplugin.setContent(handle=int(sys.argv[1]), content='seasons')
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def __init__(self):
    """Build the XBMC episode listing for a FOX show page.

    NOTE(review): each re.search below may return None on a redesigned
    page, in which case .group() raises; vidInfo is also shared across
    loop iterations rather than created per episode — confirm intended.
    """
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['showUrl']))
    showIndex = BeautifulSoup(source)
    vidInfo = {
        'tvshowtitle':
        showIndex.find('div', id='showDashboard').find('span', {
            'class': 'blueText'
        }).string,
        'studio': 'FOX'
    }
    seasonsListing = showIndex.findAll(
        'div', {
            'class': re.compile('dashPageHolder'),
            'id': re.compile('^fullEp')
        })
    print len(seasonsListing)
    for season in seasonsListing:
        episodesListing = season.findAll('div', {'class': 'episodeListing'})
        for episode in episodesListing:
            listitem = xbmcgui.ListItem(
                episode.find('h3').find('a').string)
            listitem.setThumbnailImage(
                episode.find('img', id=re.compile('^epThumb'))['src'])
            episodeLink = episode.find('a', {'class': 'thumbnailLink'})
            # Normalise the href to a site-relative URL.
            if episodeLink['href'][0] == '/':
                episodeUrl = episodeLink['href'][1:]
            else:
                episodeUrl = episodeLink['href'].replace(self.BASE_URL, '')
            # Pull aired date + plot, season number and episode
            # number/duration out of the raw HTML of the info elements.
            airedDateAndPlot = re.search(
                'Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$',
                str(episode.find('div', {'class': 'episodeInfo'})))
            seasonNum = re.search(
                'Season\s+([0-9]+?)[\s:]',
                str(episode.find('p', {'class': 'seasonNum'})))
            episodeNumAndDuration = re.search(
                'Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)',
                str(episode.find('p', {'class': 'episodeNumLine'})))
            vidInfo['aired'] = '%s-%s-%s' % (airedDateAndPlot.group(3),
                                             airedDateAndPlot.group(1),
                                             airedDateAndPlot.group(2))
            vidInfo['season'] = int(seasonNum.group(1))
            vidInfo['episode'] = int(episodeNumAndDuration.group(1))
            vidInfo['duration'] = episodeNumAndDuration.group(2)
            vidInfo['title'] = episode.find('h3').find('a').string
            vidInfo['plot'] = decode_htmlentities(
                airedDateAndPlot.group(4))
            print vidInfo
            listitem.setInfo("video", vidInfo)
            xbmcplugin.addDirectoryItem(
                handle=int(sys.argv[1]),
                listitem=listitem,
                url="%s?episodeUrl=%s" %
                (sys.argv[0], quote_plus(episodeUrl)))
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def __init__(self):
    """List every show found on the full-episodes (FOD) index page."""
    listing = BeautifulSoup(self._fetch_url(self.BASE_FOD_URL)).find(
        'div', id='episodes-listing').findAll('li')
    total = len(listing)
    handle = int(sys.argv[1])
    for entry in listing:
        link = entry.find('a')
        href = link['href']
        # Normalise the href to a site-relative URL.
        if href[0] == '/':
            relative = href[1:]
        else:
            relative = href.replace(self.BASE_URL, '')
        xbmcplugin.addDirectoryItem(
            handle=handle,
            listitem=xbmcgui.ListItem(link.string),
            url="%s?showUrl=%s" % (sys.argv[0], quote_plus(relative)),
            totalItems=total,
            isFolder=True)
    xbmcplugin.endOfDirectory(handle=handle, succeeded=1)
def _loadComment(self, communityId, userSettingFilePath, commentLogFolder): nameDict = self._loadUserSetting(communityId, userSettingFilePath) commentLogFileList = filter(lambda file: re.match(ur'ncvLog_lv\d+-{0}\.xml$'.format(communityId), file) , os.listdir(commentLogFolder)) chatList = [] for commentFile in commentLogFileList: parser = BeautifulSoup(open(os.path.join(commentLogFolder, commentFile), u'r')) liveId = u'lv' + parser.find(u'livenum').renderContents().decode(u'utf-8') chatTagList = parser.find(u'livecommentdataarray').findAll(u'chat', recursive=False) for chatTag in chatTagList: userId = chatTag.get(u'user_id') if chatTag.get(u'user_id') == u'': continue name = nameDict.get(userId) message = chatTag.renderContents().decode(u'utf-8') option = chatTag.get(u'mail') unixtime = time.localtime(int(chatTag.get(u'date'))) date = (datetime.datetime(*unixtime[:-3]).strftime(u'%Y-%m-%d %H:%M:%S') if unixtime else None).decode(u'utf-8') chatList.append((communityId, liveId, userId, name, message, option, date)) return chatList
def DoStep2(self, allofit):
    """Answer the first direct password-digit challenge.

    Reads the three requested digit positions from the page, looks each
    up in the stored password and posts them together with the
    memorable answer as step 3. Returns 'good', or 'credentials
    incorrect' when a digit cannot be resolved.
    """
    page = self.HexToByte(allofit['body'])
    soup = BeautifulSoup(page)
    self.output_page("fd-username.html", page)
    form = soup.find('form')
    action = form['action']
    inputs = form.findAllNext('input')
    self.response = {}
    # The challenge positions live in the second paragraph after the
    # form, one <strong> element per requested digit.
    paragraphs = form.findAllNext('p')
    digits = paragraphs[1].findAllNext('strong')
    try:
        password = (self.lookupdigit(digits[0].text)
                    + self.lookupdigit(digits[1].text)
                    + self.lookupdigit(digits[2].text))
    except:
        # not enough lookup digits
        logging.debug("credentials incorrect")
        return 'credentials incorrect'
    payload = {
        'password': password,
        'memorableAnswer': self.filledCreds['06'],
    }
    body = urllib.urlencode(payload)
    self.response['url'] = self.ByteToHex(action)
    self.response['data'] = self.ByteToHex(body)
    self.response['method'] = 'POST'
    self.response['step'] = 3
    return 'good'
def DoStep2(self, allofit): page = self.HexToByte( allofit['body']) #-------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) self.output_page("fd-username.html", page) loginform=soup.find('form') action = loginform['action'] inputs = loginform.findAllNext('input') values = {} self.response = {} # build the post values up - there arent any others afaik ps = loginform.findAllNext('p') numbers = ps[1].findAllNext('strong') #not enough lookup digits try: password = self.lookupdigit(numbers[0].text) + self.lookupdigit(numbers[1].text) + self.lookupdigit(numbers[2].text) except: logging.debug("credentials incorrect") return 'credentials incorrect' answer = self.filledCreds['06'] values['password'] = password values['memorableAnswer'] = answer # build the body content data = urllib.urlencode(values) self.response['url'] = self.ByteToHex(action) self.response['data'] = self.ByteToHex(data) self.response['method'] = 'POST' self.response['step'] = 3 return 'good'
def __init__(self):
    """List every show on the full-episodes (FOD) index page as folders."""
    source = self._fetch_url(self.BASE_FOD_URL)
    fodIndex = BeautifulSoup(source)
    showsListing = fodIndex.find('div', id='episodes-listing').findAll('li')
    for show in showsListing:
        showLink = show.find('a')
        # Normalise the href to a site-relative URL.
        if showLink['href'][0] == '/':
            showUrl = showLink['href'][1:]
        else:
            showUrl = showLink['href'].replace(self.BASE_URL, '')
        xbmcplugin.addDirectoryItem(
            handle=int(sys.argv[1]),
            listitem=xbmcgui.ListItem(showLink.string),
            url="%s?showUrl=%s" % (sys.argv[0], quote_plus(showUrl)),
            totalItems=len(showsListing),
            isFolder=True)
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def doStep1(self, allofit, page):
    """Follow the security frame from the RBS landing page.

    Finds the ctl00_secframe frame and queues a GET for its src as
    step 2 in self.response. Returns 'good', or 'bank error' when the
    frame is missing.
    """
    logging.info("RBS page1")
    # write out the start page (headers could be read from
    # allofit['headers'] via HexToByte if ever needed)
    self.output_page("1_first_page.html", page)
    frame = BeautifulSoup(page).find('frame', id='ctl00_secframe')
    if frame == None:
        logging.debug('RBS frame link error - ')
        return 'bank error'
    # e.g. <frame id="ctl00_secframe" src="login.aspx?refererident=...">
    target = self.urlBase + '/' + frame['src']
    self.response = {
        'url': self.ByteToHex(target),
        'data': "",
        'method': 'GET',
        'step': 2,
    }
    return 'good'
def __init__( self ): print 'Fetching %s' % self.INDEX_URL source = self._fetch_url(self.INDEX_URL) fodIndex = BeautifulSoup(source) showsListing = fodIndex.find('div',{"class":re.compile('group-full-eps')}).findAll('li') print 'Parsed listing and found %d shows' % len(showsListing) for show in showsListing: showLink = show.find('a') listitem=xbmcgui.ListItem(decode_htmlentities(showLink['title'])) episodeCount = show.find('div',text=re.compile('^[0-9]+ Videos?$')) if episodeCount: episodeCount = int(re.search('^([0-9]+)\s*Videos?$',episodeCount.string).group(1)) print 'Found "%s" with %d episodes' % (decode_htmlentities(showLink['title']),episodeCount) listitem.setInfo('video',{'episode':episodeCount}) else: print 'Found "%s" but did not find how many episodes' % decode_htmlentities(showLink['title']) listitem.setThumbnailImage(showLink.find('img')['src']) if showLink['href'][0] == '/': showUrl = showLink['href'][1:] else: showUrl = showLink['href'].replace(self.BASE_URL,'') xbmcplugin.addDirectoryItem(handle=int( sys.argv[ 1 ] ),listitem=listitem,url="%s?showUrl=%s" % ( sys.argv[ 0 ], quote_plus(showUrl)),totalItems=len(showsListing),isFolder=True) xbmcplugin.setContent(handle=int(sys.argv[1]), content='tvshows') xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ), succeeded=1 )
def __init__(self): print 'Fetching %s' % self.INDEX_URL source = self._fetch_url(self.INDEX_URL) fodIndex = BeautifulSoup(source) showsListing = fodIndex.find('div', { "class": re.compile('group-full-eps') }).findAll('li') print 'Parsed listing and found %d shows' % len(showsListing) for show in showsListing: showLink = show.find('a') listitem = xbmcgui.ListItem(decode_htmlentities(showLink['title'])) episodeCount = show.find('div', text=re.compile('^[0-9]+ Videos?$')) if episodeCount: episodeCount = int( re.search('^([0-9]+)\s*Videos?$', episodeCount.string).group(1)) print 'Found "%s" with %d episodes' % (decode_htmlentities( showLink['title']), episodeCount) listitem.setInfo('video', {'episode': episodeCount}) else: print 'Found "%s" but did not find how many episodes' % decode_htmlentities( showLink['title']) listitem.setThumbnailImage(showLink.find('img')['src']) if showLink['href'][0] == '/': showUrl = showLink['href'][1:] else: showUrl = showLink['href'].replace(self.BASE_URL, '') xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), listitem=listitem, url="%s?showUrl=%s" % (sys.argv[0], quote_plus(showUrl)), totalItems=len(showsListing), isFolder=True) xbmcplugin.setContent(handle=int(sys.argv[1]), content='tvshows') xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def doStep4(self, allofit, page): scrape_result = "good" #-------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) # write out the start page self.output_page("RBS-pos-accounts.html", page) scrape_result = 'good' logging.info("RBS message or bad cred check ") # if we still have the input then def bad credentials errorDiv=soup.findAll('input', attrs={'name' : 'ctl00$mainContent$LI6PPEA_edit'}) if len(errorDiv) != 0: logging.info("RBS defiantely bad credentials") return 'credentials incorrect' accountBLock=soup.findAll('table', attrs={'class' : 'AccountTable'}) # got some acount details so all good if len(accountBLock) > 0: logging.debug("RBS defiantely got some good accounts") return 'good'; # find any link # if we find a link return it # check for the normal continue button and fail all else - with credentials failure continueButton = soup.find('input', attrs={'id' : 'ctl00_mainContent_FinishButton_button'}) if(continueButton == None): logging.warning("RBS cant find finish button credentials incorrect") nextButton = soup.find('input', attrs={'id' : 'ctl00_mainContent_NextButton_button'}) if(nextButton == None): logging.warning("RBS cant find next button either") return 'credentials incorrect' # now find the form that these buttons belong to loginform=soup.find('form', attrs={'name': 'aspnetForm'}) if loginform == None: logging.debug('RBS no continue form') return 'bank error' action = self.urlBase + '/' + loginform['action'] # any hidden values etc values = self.parseForm(loginform) # build the body content data = urllib.urlencode(values) self.response = {} self.response['url'] = self.ByteToHex(action) self.response['data'] = self.ByteToHex(data) self.response['method'] = 'POST' self.response['step'] = 4 return 'messages'
def get(self):
    """GAE request handler: fetch a Cleartrip flight-search results page
    and render price/leg details scraped from its embedded script data.

    Query params: 'date', 'from', 'to' (IATA-style codes - TODO confirm).
    NOTE(review): eval() below runs on data pulled from a remote page -
    that executes untrusted input; it should be replaced with a JSON parser.
    """
    dates = self.request.get('date')
    froms = self.request.get("from")
    to = self.request.get("to")
    #froms= "BOM"
    #to = "CBJ"
    resp = urlfetch.fetch(
        "http://www.cleartrip.com/flights/results?from=" + froms + "&to=" + to +
        "&depart_date=" + dates +
        "&adults=1&childs=0&infants=0&dep_time=0&class=Economy&airline=&carrier=&x=57&y=16&flexi_search=no&tb=n"
    )
    soup = BS(resp.content)
    # the results page embeds its data in <script id="json">...</script>
    my_content = soup.find("script", {"id": "json"})
    string_resp = str(my_content).strip()
    #self.response.out.write(str(string_resp))
    resp_splitted = string_resp.split(';')
    #self.response.out.write(str(resp_splitted))
    self.response.headers['Content-Type'] = 'text/html'
    self.response.out.write('<html><body><table>')
    # negative start index so the loop below walks statements counted
    # from the end of the script (range(a, -9))
    a = 2 - len(resp_splitted)
    #self.response.out.write(string_resp)
    #query2 = resp_splitted[-10].split('=')
    #self.response.out.write(query2[1])
    """content = eval(query2[1])
    self.response.out.write('<tr><td>Price</td>')
    self.response.out.write('<td>'+content['pr']+'</td></tr>')
    legs = content['legs']
    i = 0
    for leg in legs:
        self.response.out.write('<tr><td>Way '+str(i)+':</td></tr>')
        self.response.out.write('<td>'+leg['fr'] + "to "+ leg['to'] +'</td>')
        self.response.out.write('<tr><td>Arrival '+str(i)+':</td></tr>')
        self.response.out.write ('<td>'+leg['a']+'</td>')
        self.response.out.write('<tr><td>Departure '+str(i)+':</td></tr>')
        self.response.out.write ('<td>'+leg['dp']+'</td>')
        i+=1"""
    for query in range(a, -9):
        query2 = resp_splitted[query].strip().split('=')
        try:
            # SECURITY: eval of remote page content - executes untrusted input
            content = eval(query2[1])
            self.response.out.write("<tr><td>******************</td></tr>")
            self.response.out.write('<tr><td>Price</td>')
            self.response.out.write('<td>' + str(content.get('pr')) + '</td></tr>')
            legs = content.get('legs')
            i = 0
            for leg in legs:
                i += 1
                self.response.out.write('<tr><td>Way ' + str(i) + ':</td>')
                self.response.out.write('<td>' + leg.get('fr') + " => " + leg['to'] + '</td></tr>')
                self.response.out.write('<tr><td>Arrival ' + str(i) + ':</td>')
                self.response.out.write('<td>' + str(leg.get('a')) + '</td></tr>')
                self.response.out.write('<tr><td>Departure ' + str(i) + ':</td>')
                self.response.out.write('<td>' + str(leg.get('dp')) + '</td></tr>')
        # NOTE(review): bare except silently skips fragments that fail to parse
        except:
            pass
def __init__(self):
    """Scrape an NBC show's season page(s) and add one XBMC directory
    item per episode, with season/episode parsed from the title.

    NOTE(review): when the title does not match the "EpSSEE" pattern,
    vidInfo['episode'] / vidInfo['season'] are never set but are read
    unconditionally when building the item URL below - that row would
    raise KeyError. Confirm against real page data before changing.
    """
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['seasonUrl']))
    showIndex = BeautifulSoup(source)
    # the show title lives in an <h1> inside one of several header divs
    tvshowcontainer = showIndex.find(
        'div',
        id=re.compile('scet_header|scet_top|show-header|show-header-scet|^header$'))
    if tvshowcontainer == None:
        tvshowcontainer = showIndex.find(
            'div',
            {'class': re.compile('scet_header|scet_top|show-header|show-header-scet')})
    if tvshowcontainer != None:
        tvshowtitle = tvshowcontainer.find('h1').string
    else:
        # fall back to the page's JavaScript siteName variable
        tvshowtitle = re.search('var siteName = "(.+?)";', source).group(1)
    # listings may be paginated; count pager links to know how many pages
    pages = 1
    if showIndex.find('div', {'class': re.compile('nbcu_pager')}):
        pageLinks = showIndex.find('div', {
            'class': re.compile('nbcu_pager')
        }).findAll('a', {'class': re.compile('nbcu_pager_page')})
        pages = len(pageLinks)
    for i in range(0, pages):
        if i > 0:
            # first page was already fetched above; fetch subsequent ones
            source = self._fetch_url(self.BASE_URL + pageLinks[i]['href'])
            showIndex = BeautifulSoup(source)
        episodesListing = showIndex.find(
            'ul', {'class': re.compile('scet_th_full')}).findAll('li')
        for episode in episodesListing:
            vidInfo = {'tvshowtitle': tvshowtitle, 'studio': 'NBC'}
            title = decode_htmlentities(
                episode.find('p', {
                    'class': re.compile('list_full_det_title')
                }).find('a').string)
            listitem = xbmcgui.ListItem(title)
            listitem.setThumbnailImage(episode.find('img')['src'])
            episodeLink = episode.find('a')
            # normalise the link to a path relative to BASE_URL
            if episodeLink['href'][0] == '/':
                episodeUrl = episodeLink['href'][1:]
            else:
                episodeUrl = episodeLink['href'].replace(self.BASE_URL, '')
            if episode.find('p', {'class': re.compile('list_full_des')}):
                vidInfo['plot'] = decode_htmlentities(
                    episode.find('p', {
                        'class': re.compile('list_full_des')
                    }).find('em').string)
            # titles look like "Ep. 412: Name" -> season 4, episode 12
            epNum = re.search(
                '^Ep(?:\.\s*)?([0-9]{1,2})([0-9][0-9])(?:\s*:\s*)?(.+)$',
                title)
            if epNum != None:
                vidInfo['season'] = int(epNum.group(1))
                vidInfo['episode'] = int(epNum.group(2))
                vidInfo['title'] = epNum.group(3)
            #airedDateAndPlot = re.search('Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$',str(episode.find('div',{'class':'episodeInfo'})))
            #seasonNum = re.search('Season\s+([0-9]+?)[\s:]',str(episode.find('p',{'class':'seasonNum'})))
            #episodeNumAndDuration = re.search('Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)',str(episode.find('p',{'class':'episodeNumLine'})))
            #vidInfo['aired'] = '%s-%s-%s' % (airedDateAndPlot.group(3),airedDateAndPlot.group(1),airedDateAndPlot.group(2))
            #vidInfo['season'] = int(seasonNum.group(1))
            #vidInfo['episode'] = int(episodeNumAndDuration.group(1))
            #vidInfo['duration'] = episodeNumAndDuration.group(2)
            #vidInfo['title'] = episode.find('h3').find('a').string
            #vidInfo['plot'] = decode_htmlentities(airedDateAndPlot.group(4))
            #print vidInfo
            listitem.setInfo("video", vidInfo)
            xbmcplugin.addDirectoryItem(
                handle=int(sys.argv[1]),
                listitem=listitem,
                url="%s?episodeUrl=%s&episode=%s&season=%s" %
                (sys.argv[0], quote_plus(episodeUrl), vidInfo['episode'],
                 vidInfo['season']))
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                             sortMethod=xbmcplugin.SORT_METHOD_EPISODE)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                             sortMethod=xbmcplugin.SORT_METHOD_DATE)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                             sortMethod=xbmcplugin.SORT_METHOD_LABEL)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                             sortMethod=xbmcplugin.SORT_METHOD_DURATION)
    xbmcplugin.setContent(handle=int(sys.argv[1]), content='episodes')
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def _processNormAccount(self, raw, account_path, balance): soup = BeautifulSoup(raw) logging.debug('Norm ac path - ' + str(account_path) + ' - end' ) try: if account_path != "": # delete existing current xactions logging.debug('Processing :) norm ' ) builder = StatementBuilder(self.facade, account_path, self.token) self.statementlist.append(builder) self.statementbuilder = self.statementlist[self.current_statement] # we know this is not a credit card isCCard = False # get a fixed balance somewhere?? # balance passed in for RBS # set up our statement self.statementbuilder.make_recent_dif_statement('RBS-recent', 'Scraper', None) # now set the final balance logging.debug("Balance - - - - - - - > " + str(balance)) self.statementbuilder.set_current_balance(balance) # now find all the recent transactions x_table = soup.find('table', attrs={'class' : 'ItemTable'}) if x_table == None: # could easily be no transactions logging.debug(" No xtable ======>") if x_table != None: x_body = x_table.find('tbody') inputs = x_body.findAll('tr') # build the post values up for rows in inputs: atts = {} vals = rows.findAll('td') if vals: cash = '' for i, val in enumerate(vals): data = self.tidy_text(val.text) if i == 0: logging.debug("date ======> " + data) dp = DateParser() try: atts['date'] = dp.ymd_from_date(dp.date_from_small(data)) except: atts['date'] == '' if i == 1: if data == 'ATM': cash = 'CASH - ' if i == 2: if data != "": extra = "" datebit = "" parts = data.split(',') if len(parts) > 1: # match RBS dates - a.la. 
8062 14APR11 if re.match('\d{4}\s\d\d[A-Z]{3}\d\d', parts[0]) != None: datebit = parts[0][0:4] + ' ' + parts[0][5:7] + ' ' + parts[0][7:10] # remember pretty_display strips out any words containing a sequence of 3 or more numbers parts = parts[1:] if len(parts) > 1: extra = parts[-1] parts = parts[0:-1] data = ' '.join(parts) disp = (cash + data).strip() atts['display'] = " ".join(disp.split()) atts['extradisplay'] = " ".join( (extra + " " + datebit).split()) if i > 2: # the numbers if data != "" and data != '-': logging.debug('->' + data + '<-') amount = self.normalise_ammount(data) if i == 3: atts['amount'] = amount atts['type'] = 'Credit' if i == 4: atts['amount'] = amount atts['type'] = 'Debit' if i == 5: self.statementbuilder.make_xact(atts) self.statementbuilder.put_statement() self.current_statement = self.current_statement + 1 except Exception, e: logging.exception('RBS parsing error - ' + str(e))
def _processCCAccount(self, raw, account_path, balance): soup = BeautifulSoup(raw) logging.debug('CC ac path - ' + str(account_path) + ' - end' ) try: if account_path != "": # delete existing current xactions logging.debug('Processing :) ' ) builder = StatementBuilder(self.facade, account_path, self.token) self.statementlist.append(builder) self.statementbuilder = self.statementlist[self.current_statement] # we know this is not a credit card isCCard = True # get a fixed balance somewhere?? # passed in for RBS # set up our statement self.statementbuilder.make_recent_dif_statement('RBS-recent', 'Scraper', None) # now set the final balance logging.debug("Balance - - - - - - - > " + str(balance)) self.statementbuilder.set_current_balance(balance) # now find all the recent transactions x_table = soup.find('table', attrs={'class' : 'ItemTable'}) if x_table != None: x_body = x_table.find('tbody') inputs = x_body.findAll('tr') # build the post values up for rows in inputs: atts = {} vals = rows.findAll('td') if vals: datebit = '' for i, val in enumerate(vals): data = self.tidy_text(val.text) if i == 0: logging.debug("date ======> " + data) dp = DateParser() try: atts['date'] = dp.ymd_from_date(dp.date_from_small(data)) except: atts['date'] == '' if i == 1: datebit = data[:-5] if i == 2: if data != 'SALE': # only keep extra xact date for Sales datebit = '' if i == 3: if data != "": atts['display'] = " ".join(data.split()) atts['extradisplay'] = datebit if i > 3: # the numbers if data != "" and data != '-': amount = self.normalise_ammount(data) if i == 4: atts['amount'] = amount atts['type'] = 'Credit' if i == 5: atts['amount'] = amount atts['type'] = 'Debit' if i == 5: self.statementbuilder.make_xact(atts) self.statementbuilder.put_statement() self.current_statement = self.current_statement + 1 except Exception, e: logging.exception('RBS parsing error - ' + str(e))
def processAccount(self, acCount, acName, account_path, allofit):
    """Parse one FD account page (hex-encoded in allofit['body']) and
    store its transactions as a 'Fd-recent' statement.

    Column layout differs between Visa and current accounts - the code
    below shifts indices by one when the Visa sort control is present.
    NOTE(review): `next = False` shadows the builtin and is never read;
    `balance_table <> None` uses the Python-2-only <> operator.
    """
    page = self.HexToByte(allofit['body'])
    # save this page for offline debugging
    self.output_page("account" + str(acCount) + ".html", page)
    soup = BeautifulSoup(page)
    logging.debug('ac path - ' + str(account_path) + ' - end')
    if account_path != "":
        # delete existing current xactions
        logging.debug('Processing :) ')
        self.statementbuilder = StatementBuilder(self.facade, account_path,
                                                 self.token)
        # need to get last statement and make a new one every time
        self.statementbuilder.make_recent_dif_statement(
            'Fd-recent', 'Scraper', None)
        #TODO change this
        # the reference-sort control only appears on Visa statements
        isVisa = False
        loginform = soup.find(
            'input', attrs={'name': 'cmd_sort_referenceAscending'})
        if loginform != None:
            isVisa = True
        bal_tables = soup.findAll(
            'table', attrs={'class': 'fdTableBackgroundOne'})
        balance_table = bal_tables[2]
        if balance_table <> None:
            vals = balance_table.findAll('td')
            if vals:
                bal = vals[1].text
                # normalise the pound sign - presumably byte vs unicode
                # encodings of the same glyph; TODO confirm
                data = bal.replace('£', u'£')
                data = data.strip(u'£')
                # trailing D/DB marks a debit (negative) balance,
                # C/CR a credit balance
                if data[-1] == 'D':
                    data = data.replace('DB', '')
                    data = data.replace('D', '')
                    lastbal = int(float(data) * 100)
                    firstbal = 0 - lastbal
                else:
                    data = data.replace('CR', '')
                    data = data.replace('C', '')
                    firstbal = int(float(data) * 100)
                self.statementbuilder.set_current_balance(firstbal)
        logging.debug(
            "-----------------------------*******---------------------")
        if isVisa:
            logging.debug("found visa --")
        acTable = soup.find('table', attrs={'class': 'fdStatTable'})
        # if no table then no new data afaik
        if acTable != None:
            datarows = acTable.findAll('tr')
            next = False
            # build the post values up
            atts = {}
            isFirst = True
            firstbal = 0
            firstdate = ""
            lastbal = 0
            lastdate = ""
            doBalance = False
            dp = DateParser()
            for rows in datarows:
                vals = rows.findAll('td')
                if vals:
                    for i, val in enumerate(vals):
                        if val.text:
                            data = val.text.strip()
                            data = unescape(data)
                            data = unicode(data)
                        else:
                            data = ""
                        if data != " ":
                            data = data.replace(' ', '')
                        if i == 0:
                            # a row without a date re-uses the last seen date
                            if data != "":
                                try:
                                    lastdate = dp.ymd_from_date(
                                        dp.date_from_dmy(data, '/'))
                                except:
                                    logging.warn(
                                        "Invalid FD date format - probably no transactions"
                                    )
                                    return
                            if firstdate == "":
                                firstdate = lastdate
                            atts['date'] = lastdate
                        # description column (shifted by 1 on Visa)
                        if (i == 1 and not isVisa) or (i == 2 and isVisa):
                            atts['display'] = data[0:19]
                            atts['extradisplay'] = data[19:]
                        # debit amount column
                        if (i == 2 and not isVisa) or (i == 3 and isVisa):
                            if data != "":
                                data = data.strip(u'£')
                                data = data.strip(u'D')
                                data = data.strip(u'B')
                                if data == '':
                                    atts['amount'] = 0
                                else:
                                    atts['amount'] = int(float(data) * 100)
                                atts['type'] = 'Debit'
                        # credit amount column
                        if (i == 3 and not isVisa) or (i == 4 and isVisa):
                            if data != "":
                                data = data.strip(u'£')
                                data = data.strip(u'C')
                                data = data.strip(u'R')
                                if data == '':
                                    atts['amount'] = 0
                                else:
                                    atts['amount'] = int(float(data) * 100)
                                atts['type'] = 'Credit'
                        if not isVisa:
                            # current accounts carry a running balance in
                            # columns 4/5; the first row's value becomes
                            # the statement balance
                            if i == 4:
                                data = data.strip(u'£')
                                if data != "":
                                    lastbal = int(float(data) * 100)
                                    if isFirst:
                                        isFirst = False
                                        firstbal = lastbal
                                        doBalance = True
                            if i == 5:
                                if doBalance:
                                    doBalance = False
                                    if data == "D":
                                        firstbal = 0 - firstbal
                                    self.statementbuilder.set_current_balance(
                                        firstbal)
                    self.statementbuilder.make_xact(atts)
            self.statementbuilder.put_statement()
            self.current_statement = self.current_statement + 1
def _processNormAccount(self, raw, account_path, balance): soup = BeautifulSoup(raw) logging.debug("Norm ac path - " + str(account_path) + " - end") try: if account_path != "": # delete existing current xactions logging.debug("Processing :) norm ") builder = StatementBuilder(self.facade, account_path, self.token) self.statementlist.append(builder) self.statementbuilder = self.statementlist[self.current_statement] # we know this is not a credit card isCCard = False # get a fixed balance somewhere?? # balance passed in for natwest # set up our statement self.statementbuilder.make_recent_dif_statement("NatWest-recent", "Scraper", None) # now set the final balance logging.debug("Balance - - - - - - - > " + str(balance)) self.statementbuilder.set_current_balance(balance) # now find all the recent transactions x_table = soup.find("table", attrs={"class": "ItemTable"}) if x_table == None: # could easily be no transactions logging.debug(" No xtable ======>") if x_table != None: x_body = x_table.find("tbody") inputs = x_body.findAll("tr") # build the post values up for rows in inputs: atts = {} vals = rows.findAll("td") if vals: cash = "" for i, val in enumerate(vals): data = self.tidy_text(val.text) if i == 0: dp = DateParser() try: atts["date"] = dp.ymd_from_date(dp.date_from_small(data)) except: atts["date"] == "" if i == 1: if data == "ATM": cash = "CASH - " if i == 2: if data != "": extra = "" datebit = "" parts = data.split(",") if len(parts) > 1: # match natwest dates - a.la. 
8062 14APR11 if re.match("\d{4}\s\d\d[A-Z]{3}\d\d", parts[0]) != None: datebit = parts[0][0:4] + " " + parts[0][5:7] + " " + parts[0][7:10] # remember pretty_display strips out any words containing a sequence of 3 or more numbers parts = parts[1:] if len(parts) > 1: extra = parts[-1] parts = parts[0:-1] data = " ".join(parts) disp = (cash + data).strip() atts["display"] = " ".join(disp.split()) atts["extradisplay"] = " ".join((extra + " " + datebit).split()) if i > 2: # the numbers if data != "" and data != "-": amount = self.normalise_ammount(data) if i == 3: atts["amount"] = amount atts["type"] = "Credit" if i == 4: atts["amount"] = amount atts["type"] = "Debit" if i == 5: self.statementbuilder.make_xact(atts) self.statementbuilder.put_statement() self.current_statement = self.current_statement + 1 except Exception, e: logging.exception("NatWest parsing error - " + str(e))
class ProntuarioVeiculo(object):
    '''Parses a DetranNet vehicle record ("prontuario") HTML page into a
    dictionary keyed by the page's section labels.

    The page is split into fixed, numbered div sections
    (div_servicos_NN); each _parsear_* method handles one of them.
    '''

    def __init__(self, prontuario_html):
        '''Parse every section of the given HTML document up front.

        prontuario_html -- raw HTML of the record page (iso-8859-1)
        '''
        self.__soup = \
            BeautifulSoup(prontuario_html, fromEncoding='iso-8859-1',
                          convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.__prontuario = {}
        self._parsear_dados_veiculo()
        self._parsear_debitos()
        self._parsear_infracoes_em_autuacao()
        self._parsear_listagem_multas()
        self._parsear_historico_multas()
        self._parsear_ultimo_processo()
        self._parsear_recurso_infracao()

    def obter_prontuario(self):
        # deep copy so callers cannot mutate the parsed state
        return deepcopy(self.__prontuario)

    def _parsear_dados_veiculo(self):
        # vehicle data section: each <td> holds a label/value text pair
        tabela = self.__soup.find('div', id='div_servicos_02' ).table.tbody
        for celula in tabela.findAll('td'):
            dado = celula.findAll(text=True)
            if len(dado) == 2:
                self.__prontuario[dado[0].strip()] = dado[1].strip()

    def _parsear_debitos(self):
        # outstanding debts table; skip the header and footer rows
        tabela = self.__soup.find('div', id='div_servicos_03' ).table.tbody
        debitos = []
        for linha in tabela.findAll('tr')[1:-1]:
            debito = {}
            texto = linha.td.findAll(text=True)
            if texto == None:
                texto = ''
            debito[u'Classe'] = ''.join(texto).strip()
            link = ''
            if linha.td.a != None:
                link = linha.td.a['href'].strip()
            debito[u'Link'] = link
            celulas = [u'Número DetranNet', u'Vencimento',
                       u'Valor Nominal(R$)', u'Multa(R$)', u'Juros(R$)',
                       u'Valor Atual(R$)']
            for celula, valor in zip(celulas, linha.findAll('td')[1:]):
                debito[celula] = valor.string.strip()
            debitos.append(debito)
        self.__prontuario[u'Débitos'] = debitos

    def _parsear_infracoes_em_autuacao(self):
        # infractions under assessment: leaf <td> cells come in groups of
        # seven per infraction, after three header cells
        tabela = self.__soup.find('div', id='div_servicos_10' ).table.tbody
        celula_filha = lambda tag: tag.name == 'td' and tag.table == None
        celulas = tabela.findAll(celula_filha)[3:]
        infracoes = []
        for i in range(len(celulas)/7):
            linha = 7 * i
            infracao = {}
            infracao[u'Número'] = celulas[linha].a.string
            infracao[u'Link'] = celulas[linha].a['href'].strip()
            infracao[u'Valor'] = celulas[linha + 1].string.strip()
            infracao[u'Situação'] = celulas[linha + 2].string.strip()
            infracao[u'Descrição 1'] = celulas[linha + 3].string.strip()
            infracao[u'Descrição 2'] = celulas[linha + 4].string.strip()
            infracao[u'Local/Complemento 1'] = celulas[linha + 5].string.strip()
            # the last cell may be empty (None) - normalise to u''
            infracao[u'Local/Complemento 2'] = celulas[linha + 6].string
            if infracao[u'Local/Complemento 2'] == None:
                infracao[u'Local/Complemento 2'] = u''
            infracao[u'Local/Complemento 2'] = infracao[u'Local/Complemento 2'].strip()
            infracoes.append(infracao)
        self.__prontuario[u'Infrações em Autuação'] = infracoes

    #TODO: implement full parsing of the fines listing
    def _parsear_listagem_multas(self):
        tabela = self.__soup.find('div', id='div_servicos_04' ).table.tbody
        # only the "no fines" case is handled so far
        if tabela.tr.td.find(text=re.compile(u'Nenhuma?')):
            self.__prontuario[u'Listagem de Multas'] = []
            return

    def _parsear_historico_multas(self):
        # fines history: same flattened 7-cells-per-row layout as the
        # infractions section above
        tabela = self.__soup.find('div', id='div_servicos_07' ).table.tbody
        celula_filha = lambda tag: tag.name == 'td' and tag.table == None
        celulas = tabela.findAll(celula_filha)[3:]
        multas = []
        for i in range(len(celulas)/7):
            linha = 7 * i
            multa = {}
            multa[u'Número'] = celulas[linha].a.string
            multa[u'Link'] = celulas[linha].a['href'].strip()
            multa[u'Lançamento'] = celulas[linha + 1].string.strip()
            multa[u'Pagamento'] = celulas[linha + 2].string.strip()
            multa[u'Descrição 1'] = celulas[linha + 3].string.strip()
            multa[u'Descrição 2'] = celulas[linha + 4].string.strip()
            multa[u'Local/Complemento 1'] = celulas[linha + 5].string.strip()
            # the last cell may be empty (None) - normalise to u''
            multa[u'Local/Complemento 2'] = celulas[linha + 6].string
            if multa[u'Local/Complemento 2'] == None:
                multa[u'Local/Complemento 2'] = u''
            multa[u'Local/Complemento 2'] = multa[u'Local/Complemento 2'].strip()
            multas.append(multa)
        self.__prontuario[u'Histórico de Multas'] = multas

    def _parsear_ultimo_processo(self):
        tabela = self.__soup.find('div', id='div_servicos_11' ).table.tbody
        ultimo_processo = {}
        celulas = tabela.findAll('td')
        # the first five cells each hold a label/value text pair
        for celula in celulas[:5]:
            dado = celula.findAll(text=True)
            ultimo_processo[dado[0]] = dado[1]
        # the remaining cells alternate key-cell, value-cell
        for i in range(7, len(celulas), 2):
            chave = celulas[i].findAll(text=True)[0]
            valor = celulas[i + 1].findAll(text=True)[0]
            ultimo_processo[chave] = valor
        self.__prontuario[u'Último Processo'] = ultimo_processo

    #TODO: implement full parsing of the infraction appeals section
    def _parsear_recurso_infracao(self):
        tabela = self.__soup.find('div', id='div_servicos_09' ).table.tbody
        # only the "no appeals" case is handled so far
        if tabela.tr.td.find(text=re.compile(u'Nenhuma?')):
            self.__prontuario[u'Recurso de Infração'] = []
            return
def _processNormAccount(self, raw, account_path, balance): soup = BeautifulSoup(raw) logging.debug('Norm ac path - ' + str(account_path) + ' - end') try: if account_path != "": # delete existing current xactions logging.debug('Processing :) norm ') builder = StatementBuilder(self.facade, account_path, self.token) self.statementlist.append(builder) self.statementbuilder = self.statementlist[ self.current_statement] # we know this is not a credit card isCCard = False # get a fixed balance somewhere?? # balance passed in for natwest # set up our statement self.statementbuilder.make_recent_dif_statement( 'NatWest-recent', 'Scraper', None) # now set the final balance logging.debug("Balance - - - - - - - > " + str(balance)) self.statementbuilder.set_current_balance(balance) # now find all the recent transactions x_table = soup.find('table', attrs={'class': 'ItemTable'}) if x_table == None: # could easily be no transactions logging.debug(" No xtable ======>") if x_table != None: x_body = x_table.find('tbody') inputs = x_body.findAll('tr') # build the post values up for rows in inputs: atts = {} vals = rows.findAll('td') if vals: cash = '' for i, val in enumerate(vals): data = self.tidy_text(val.text) if i == 0: dp = DateParser() try: atts['date'] = dp.ymd_from_date( dp.date_from_small(data)) except: atts['date'] == '' if i == 1: if data == 'ATM': cash = 'CASH - ' if i == 2: if data != "": extra = "" datebit = "" parts = data.split(',') if len(parts) > 1: # match natwest dates - a.la. 
8062 14APR11 if re.match( '\d{4}\s\d\d[A-Z]{3}\d\d', parts[0]) != None: datebit = parts[0][ 0:4] + ' ' + parts[0][ 5:7] + ' ' + parts[0][ 7:10] # remember pretty_display strips out any words containing a sequence of 3 or more numbers parts = parts[1:] if len(parts) > 1: extra = parts[-1] parts = parts[0:-1] data = ' '.join(parts) disp = (cash + data).strip() atts['display'] = " ".join( disp.split()) atts['extradisplay'] = " ".join( (extra + " " + datebit).split()) if i > 2: # the numbers if data != "" and data != '-': amount = self.normalise_ammount(data) if i == 3: atts['amount'] = amount atts['type'] = 'Credit' if i == 4: atts['amount'] = amount atts['type'] = 'Debit' if i == 5: self.statementbuilder.make_xact(atts) self.statementbuilder.put_statement() self.current_statement = self.current_statement + 1 except Exception, e: logging.exception('NatWest parsing error - ' + str(e))
def _processCCAccount(self, raw, account_path, balance): soup = BeautifulSoup(raw) logging.debug('CC ac path - ' + str(account_path) + ' - end') try: if account_path != "": # delete existing current xactions logging.debug('Processing :) ') builder = StatementBuilder(self.facade, account_path, self.token) self.statementlist.append(builder) self.statementbuilder = self.statementlist[ self.current_statement] # we know this is not a credit card isCCard = True # get a fixed balance somewhere?? # passed in for natwest # set up our statement self.statementbuilder.make_recent_dif_statement( 'NatWest-recent', 'Scraper', None) # now set the final balance logging.debug("Balance - - - - - - - > " + str(balance)) self.statementbuilder.set_current_balance(balance) # now find all the recent transactions x_table = soup.find('table', attrs={'class': 'ItemTable'}) if x_table != None: x_body = x_table.find('tbody') inputs = x_body.findAll('tr') # build the post values up for rows in inputs: atts = {} vals = rows.findAll('td') if vals: datebit = '' for i, val in enumerate(vals): data = self.tidy_text(val.text) if i == 0: dp = DateParser() try: atts['date'] = dp.ymd_from_date( dp.date_from_small(data)) except: atts['date'] == '' if i == 1: datebit = data[:-5] if i == 2: if data != 'SALE': # only keep extra xact date for Sales datebit = '' if i == 3: if data != "": atts['display'] = " ".join( data.split()).encode('utf8') atts['extradisplay'] = datebit.encode( 'utf8') if i > 3: # the numbers if data != "" and data != '-': amount = self.normalise_ammount(data) if i == 4: atts['amount'] = amount atts['type'] = 'Credit' if i == 5: atts['amount'] = amount atts['type'] = 'Debit' if i == 5: self.statementbuilder.make_xact(atts) self.statementbuilder.put_statement() self.current_statement = self.current_statement + 1 except Exception, e: logging.exception('NatWest parsing error - ' + str(e))
def doStep4(self, allofit, page): scrape_result = "good" # -------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) # write out the start page self.output_page("natwest-pos-accounts.html", page) scrape_result = "good" logging.info("NatWest message or bad cred check ") # if we still have the input then def bad credentials errorDiv = soup.findAll("input", attrs={"name": "ctl00$mainContent$LI6PPEA_edit"}) if len(errorDiv) != 0: logging.info("NatWest defiantely bad credentials") return "credentials incorrect" accountBLock = soup.findAll("table", attrs={"class": "AccountTable"}) # got some acount details so all good if len(accountBLock) > 0: logging.debug("NatWest defiantely got some good accounts") return "good" # find any link # if we find a link return it # check for the normal continue button and fail all else - with credentials failure continueButton = soup.find("input", attrs={"id": "ctl00_mainContent_FinishButton_button"}) if continueButton == None: logging.warning("NatWest cant find finish button credentials incorrect") nextButton = soup.find("input", attrs={"id": "ctl00_mainContent_NextButton_button"}) if nextButton == None: logging.warning("NatWest cant find next button either") return "credentials incorrect" # now find the form that these buttons belong to loginform = soup.find("form", attrs={"name": "aspnetForm"}) if loginform == None: logging.debug("NatWest no continue form") return "bank error" else: logging.debug("found a continue form - so clicking it") action = self.urlBase + "/" + loginform["action"] # any hidden values etc values = self.parseForm(loginform) # build the body content data = urllib.urlencode(values) self.response = {} self.response["url"] = self.ByteToHex(action) self.response["data"] = self.ByteToHex(data) self.response["method"] = "POST" self.response["step"] = 4 return "messages"
def _processCCAccount(self, raw, account_path, balance): soup = BeautifulSoup(raw) logging.debug("CC ac path - " + str(account_path) + " - end") try: if account_path != "": # delete existing current xactions logging.debug("Processing :) ") builder = StatementBuilder(self.facade, account_path, self.token) self.statementlist.append(builder) self.statementbuilder = self.statementlist[self.current_statement] # we know this is not a credit card isCCard = True # get a fixed balance somewhere?? # passed in for natwest # set up our statement self.statementbuilder.make_recent_dif_statement("NatWest-recent", "Scraper", None) # now set the final balance logging.debug("Balance - - - - - - - > " + str(balance)) self.statementbuilder.set_current_balance(balance) # now find all the recent transactions x_table = soup.find("table", attrs={"class": "ItemTable"}) if x_table != None: x_body = x_table.find("tbody") inputs = x_body.findAll("tr") # build the post values up for rows in inputs: atts = {} vals = rows.findAll("td") if vals: datebit = "" for i, val in enumerate(vals): data = self.tidy_text(val.text) if i == 0: dp = DateParser() try: atts["date"] = dp.ymd_from_date(dp.date_from_small(data)) except: atts["date"] == "" if i == 1: datebit = data[:-5] if i == 2: if data != "SALE": # only keep extra xact date for Sales datebit = "" if i == 3: if data != "": atts["display"] = " ".join(data.split()).encode("utf8") atts["extradisplay"] = datebit.encode("utf8") if i > 3: # the numbers if data != "" and data != "-": amount = self.normalise_ammount(data) if i == 4: atts["amount"] = amount atts["type"] = "Credit" if i == 5: atts["amount"] = amount atts["type"] = "Debit" if i == 5: self.statementbuilder.make_xact(atts) self.statementbuilder.put_statement() self.current_statement = self.current_statement + 1 except Exception, e: logging.exception("NatWest parsing error - " + str(e))
def processAccount(self, acCount, acName, account_path, allofit):
    """Parse one First Direct account page and store its recent statement.

    acCount      -- index of the account, used only in the debug page name
    acName       -- display name of the account (unused in this method)
    account_path -- datastore path of the account ("" means skip entirely)
    allofit      -- dict whose hex-encoded 'body' entry holds the page HTML

    Fixes vs. original: the hard-coded ``bal_tables[2]`` was indexed before
    the None check and raised IndexError when fewer than three tables were
    present; the deprecated ``<>`` operator and the unused builtin-shadowing
    local ``next`` were removed; bare excepts narrowed to Exception.
    """
    page = self.HexToByte(allofit['body'])
    # save this page for offline inspection
    self.output_page("account" + str(acCount) + ".html", page)
    soup = BeautifulSoup(page)
    logging.debug('ac path - ' + str(account_path) + ' - end')
    if account_path == "":
        return
    logging.debug('Processing :) ')
    self.statementbuilder = StatementBuilder(self.facade, account_path, self.token)
    # need to get last statement and make a new one every time
    self.statementbuilder.make_recent_dif_statement('Fd-recent', 'Scraper', None)
    # TODO change this
    # Visa pages carry a sort control that ordinary accounts do not.
    isVisa = soup.find('input', attrs={'name': 'cmd_sort_referenceAscending'}) is not None
    # ----- headline balance ------------------------------------------------
    bal_tables = soup.findAll('table', attrs={'class': 'fdTableBackgroundOne'})
    # BUG FIX: guard the hard-coded index before using it.
    balance_table = bal_tables[2] if len(bal_tables) > 2 else None
    if balance_table is not None:
        vals = balance_table.findAll('td')
        if vals:
            bal = vals[1].text
            data = bal.replace('£', u'£')
            data = data.strip(u'£')
            if data[-1] == 'D':
                # Debit balance, e.g. "12.34D"/"12.34DB" -> negative pence.
                data = data.replace('DB', '')
                data = data.replace('D', '')
                lastbal = int(float(data) * 100)
                firstbal = 0 - lastbal
            else:
                # Credit balance, e.g. "12.34CR" -> positive pence.
                data = data.replace('CR', '')
                data = data.replace('C', '')
                firstbal = int(float(data) * 100)
            self.statementbuilder.set_current_balance(firstbal)
    logging.debug("-----------------------------*******---------------------")
    if isVisa:
        logging.debug("found visa --")
    # ----- transactions table ----------------------------------------------
    acTable = soup.find('table', attrs={'class': 'fdStatTable'})
    # if no table then no new data afaik
    if acTable is not None:
        datarows = acTable.findAll('tr')
        atts = {}
        isFirst = True
        firstbal = 0
        firstdate = ""
        lastbal = 0
        lastdate = ""
        doBalance = False
        dp = DateParser()
        for rows in datarows:
            vals = rows.findAll('td')
            if not vals:
                continue
            for i, val in enumerate(vals):
                if val.text:
                    data = unicode(unescape(val.text.strip()))
                else:
                    data = ""
                if data != " ":
                    data = data.replace(' ', '')
                if i == 0:
                    # Transaction date column; gives up on the whole page if
                    # the format is wrong (usually means no transactions).
                    if data != "":
                        try:
                            lastdate = dp.ymd_from_date(dp.date_from_dmy(data, '/'))
                        except Exception:
                            logging.warn("Invalid FD date format - probably no transactions")
                            return
                    if firstdate == "":
                        firstdate = lastdate
                    atts['date'] = lastdate
                if (i == 1 and not isVisa) or (i == 2 and isVisa):
                    # Description column: first 19 chars are the display name.
                    atts['display'] = data[0:19]
                    atts['extradisplay'] = data[19:]
                if (i == 2 and not isVisa) or (i == 3 and isVisa):
                    # Debit amount column.
                    if data != "":
                        data = data.strip(u'£')
                        data = data.strip(u'D')
                        data = data.strip(u'B')
                        if data == '':
                            atts['amount'] = 0
                        else:
                            atts['amount'] = int(float(data) * 100)
                        atts['type'] = 'Debit'
                if (i == 3 and not isVisa) or (i == 4 and isVisa):
                    # Credit amount column.
                    if data != "":
                        data = data.strip(u'£')
                        data = data.strip(u'C')
                        data = data.strip(u'R')
                        if data == '':
                            atts['amount'] = 0
                        else:
                            atts['amount'] = int(float(data) * 100)
                        atts['type'] = 'Credit'
                if not isVisa:
                    if i == 4:
                        # Running balance column (non-Visa only).
                        data = data.strip(u'£')
                        if data != "":
                            lastbal = int(float(data) * 100)
                            if isFirst:
                                isFirst = False
                                firstbal = lastbal
                                doBalance = True
                    if i == 5:
                        # Balance sign column; only acted on for the first row.
                        if doBalance:
                            doBalance = False
                            if data == "D":
                                firstbal = 0 - firstbal
                            self.statementbuilder.set_current_balance(firstbal)
            self.statementbuilder.make_xact(atts)
        self.statementbuilder.put_statement()
        self.current_statement = self.current_statement + 1
class ProntuarioVeiculo(object):
    """Scraper for a Detran vehicle record ("prontuário") HTML page.

    The constructor eagerly parses every section of the page; the result is
    exposed as a plain dict via obter_prontuario().

    Fixes vs. original: the two 7-column infraction tables (infractions in
    notice stage, fines history) were parsed by near-identical duplicated
    code; both now share _parsear_tabela_sete_colunas.  ``== None`` checks
    were replaced with ``is None``.
    """

    def __init__(self, prontuario_html):
        """Parse prontuario_html (ISO-8859-1 encoded) into the record dict."""
        self.__soup = \
            BeautifulSoup(prontuario_html, fromEncoding='iso-8859-1',
                          convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.__prontuario = {}
        self._parsear_dados_veiculo()
        self._parsear_debitos()
        self._parsear_infracoes_em_autuacao()
        self._parsear_listagem_multas()
        self._parsear_historico_multas()
        self._parsear_ultimo_processo()
        self._parsear_recurso_infracao()

    def obter_prontuario(self):
        """Return a deep copy of the parsed record so callers cannot mutate it."""
        return deepcopy(self.__prontuario)

    def _parsear_dados_veiculo(self):
        # Vehicle data: each cell holds a label/value pair of text nodes.
        tabela = self.__soup.find('div', id='div_servicos_02').table.tbody
        for celula in tabela.findAll('td'):
            dado = celula.findAll(text=True)
            if len(dado) == 2:
                self.__prontuario[dado[0].strip()] = dado[1].strip()

    def _parsear_debitos(self):
        # Outstanding debts; first row is the header and last is a footer.
        tabela = self.__soup.find('div', id='div_servicos_03').table.tbody
        debitos = []
        for linha in tabela.findAll('tr')[1:-1]:
            debito = {}
            texto = linha.td.findAll(text=True)
            if texto is None:
                # NOTE(review): findAll returns a list, so this branch looks
                # unreachable -- kept for safety.
                texto = ''
            debito[u'Classe'] = ''.join(texto).strip()
            link = ''
            if linha.td.a is not None:
                link = linha.td.a['href'].strip()
            debito[u'Link'] = link
            celulas = [
                u'Número DetranNet', u'Vencimento', u'Valor Nominal(R$)',
                u'Multa(R$)', u'Juros(R$)', u'Valor Atual(R$)'
            ]
            for celula, valor in zip(celulas, linha.findAll('td')[1:]):
                debito[celula] = valor.string.strip()
            debitos.append(debito)
        self.__prontuario[u'Débitos'] = debitos

    def _parsear_tabela_sete_colunas(self, id_div, rotulos):
        """Shared parser for the two 7-column infraction tables.

        id_div  -- id of the <div> whose table holds the rows
        rotulos -- pair of dict keys for columns 1 and 2 (the only labels
                   that differ between the two tables)
        Returns a list of dicts, one per 7-cell logical row.
        """
        tabela = self.__soup.find('div', id=id_div).table.tbody
        # Leaf cells only (skip cells that contain nested tables); the
        # first three are headers.
        celula_filha = lambda tag: tag.name == 'td' and tag.table is None
        celulas = tabela.findAll(celula_filha)[3:]
        registros = []
        for i in range(len(celulas) // 7):
            linha = 7 * i
            registro = {}
            registro[u'Número'] = celulas[linha].a.string
            registro[u'Link'] = celulas[linha].a['href'].strip()
            registro[rotulos[0]] = celulas[linha + 1].string.strip()
            registro[rotulos[1]] = celulas[linha + 2].string.strip()
            registro[u'Descrição 1'] = celulas[linha + 3].string.strip()
            registro[u'Descrição 2'] = celulas[linha + 4].string.strip()
            registro[u'Local/Complemento 1'] = celulas[linha + 5].string.strip()
            # The last column may be an empty cell (string is None).
            complemento = celulas[linha + 6].string
            if complemento is None:
                complemento = u''
            registro[u'Local/Complemento 2'] = complemento.strip()
            registros.append(registro)
        return registros

    def _parsear_infracoes_em_autuacao(self):
        # Infractions still at the notice ("autuação") stage.
        self.__prontuario[u'Infrações em Autuação'] = \
            self._parsear_tabela_sete_colunas('div_servicos_10',
                                              (u'Valor', u'Situação'))

    # TODO: Implementar
    def _parsear_listagem_multas(self):
        # Only the "no fines" case is handled; otherwise the key is never
        # set (pre-existing behaviour, see TODO above).
        tabela = self.__soup.find('div', id='div_servicos_04').table.tbody
        if tabela.tr.td.find(text=re.compile(u'Nenhuma?')):
            self.__prontuario[u'Listagem de Multas'] = []
            return

    def _parsear_historico_multas(self):
        # History of fines with their posting and payment dates.
        self.__prontuario[u'Histórico de Multas'] = \
            self._parsear_tabela_sete_colunas('div_servicos_07',
                                              (u'Lançamento', u'Pagamento'))

    def _parsear_ultimo_processo(self):
        tabela = self.__soup.find('div', id='div_servicos_11').table.tbody
        ultimo_processo = {}
        celulas = tabela.findAll('td')
        # First five cells hold label/value text-node pairs in one cell.
        for celula in celulas[:5]:
            dado = celula.findAll(text=True)
            ultimo_processo[dado[0]] = dado[1]
        # From cell 7 on, labels and values alternate in separate cells.
        for i in range(7, len(celulas), 2):
            chave = celulas[i].findAll(text=True)[0]
            valor = celulas[i + 1].findAll(text=True)[0]
            ultimo_processo[chave] = valor
        self.__prontuario[u'Último Processo'] = ultimo_processo

    # TODO: Implementar
    def _parsear_recurso_infracao(self):
        # Only the "no appeals" case is handled; otherwise the key is never
        # set (pre-existing behaviour, see TODO above).
        tabela = self.__soup.find('div', id='div_servicos_09').table.tbody
        if tabela.tr.td.find(text=re.compile(u'Nenhuma?')):
            self.__prontuario[u'Recurso de Infração'] = []
            return
def doStep4(self, allofit, page):
    """Examine the NatWest post-login page and work out what comes next.

    Returns 'good' when account tables are present, 'credentials incorrect'
    when the login input persists or no continue button can be found,
    'bank error' when the continue form is missing, and 'messages' after
    preparing self.response with the POST that submits the interstitial
    continue form.
    """
    scrape_result = "good"
    parsed = BeautifulSoup(page)
    # keep a copy of the page for offline inspection
    self.output_page("natwest-pos-accounts.html", page)
    scrape_result = 'good'
    logging.info("NatWest message or bad cred check ")
    # The login input is still on the page -> the credentials were rejected.
    bad_cred_inputs = parsed.findAll(
        'input', attrs={'name': 'ctl00$mainContent$LI6PPEA_edit'})
    if len(bad_cred_inputs) != 0:
        logging.info("NatWest defiantely bad credentials")
        return 'credentials incorrect'
    # Account tables on the page -> we are fully logged in.
    account_tables = parsed.findAll('table', attrs={'class': 'AccountTable'})
    if len(account_tables) > 0:
        logging.debug("NatWest defiantely got some good accounts")
        return 'good'
    # Otherwise hunt for the interstitial finish/next buttons; if neither
    # exists, treat it as a credentials failure.
    finish_btn = parsed.find(
        'input', attrs={'id': 'ctl00_mainContent_FinishButton_button'})
    if finish_btn is None:
        logging.warning(
            "NatWest cant find finish button credentials incorrect")
        next_btn = parsed.find(
            'input', attrs={'id': 'ctl00_mainContent_NextButton_button'})
        if next_btn is None:
            logging.warning("NatWest cant find next button either")
            return 'credentials incorrect'
    # Find the ASP.NET form that owns the buttons and prepare to submit it.
    continue_form = parsed.find('form', attrs={'name': 'aspnetForm'})
    if continue_form is None:
        logging.debug('NatWest no continue form')
        return 'bank error'
    logging.debug('found a continue form - so clicking it')
    target_url = self.urlBase + '/' + continue_form['action']
    # gather hidden fields and encode the POST body
    form_fields = self.parseForm(continue_form)
    post_body = urllib.urlencode(form_fields)
    self.response = {
        'url': self.ByteToHex(target_url),
        'data': self.ByteToHex(post_body),
        'method': 'POST',
        'step': 4,
    }
    return 'messages'