def doStep6(self, page):
    # -------------------- Grab the form values --------------------
    soup = BeautifulSoup(page)
    self.output_page("RBS-acclink.html", page)
    loginform = soup.find('form', attrs={'name': 'aspnetForm'})
    if loginform is None:
        logging.debug('RBS no view account form')
        return 'bank error'
    action = self.urlBase + '/' + loginform['action']
    values = self.parseForm(loginform)
    # fill in our selection - 1 month
    values['ctl00$mainContent$SS2SPDDA'] = 'M1'
    # default button - needed
    values['ctl00$mainContent$NextButton_button'] = 'View Transactions'
    # build the body content
    data = urllib.urlencode(values)
    self.response = {}
    self.response['url'] = self.ByteToHex(action)
    self.response['data'] = self.ByteToHex(data)
    self.response['method'] = 'POST'
    self.response['step'] = 20
    return 'good'
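# `parseForm` is called throughout these bank-scraper steps but never shown.
# A minimal sketch, assuming it only collects the name/value pairs of every
# <input> in the form so that hidden ASP.NET state (__VIEWSTATE and friends)
# is carried forward; the real helper may also filter buttons or handle
# <select> elements.
def parseForm(self, form):
    values = {}
    for field in form.findAll('input'):
        # unnamed inputs (e.g. decorative buttons) carry no post data
        if field.has_key('name'):
            values[field['name']] = field.get('value', '')
    return values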
def get(self):
    dates = self.request.get('date')
    froms = self.request.get("from")
    to = self.request.get("to")
    #froms = "BOM"
    #to = "CBJ"
    resp = urlfetch.fetch("http://www.cleartrip.com/flights/results?from=" + froms
                          + "&to=" + to + "&depart_date=" + dates
                          + "&adults=1&childs=0&infants=0&dep_time=0&class=Economy"
                          + "&airline=&carrier=&x=57&y=16&flexi_search=no&tb=n")
    soup = BS(resp.content)
    my_content = soup.find("script", {"id": "json"})
    string_resp = str(my_content).strip()
    resp_splitted = string_resp.split(';')
    self.response.headers['Content-Type'] = 'text/html'
    self.response.out.write('<html><body><table>')
    a = 2 - len(resp_splitted)
    for query in range(a, -9):
        query2 = resp_splitted[query].strip().split('=')
        try:
            content = eval(query2[1])
            self.response.out.write("<tr><td>******************</td></tr>")
            self.response.out.write('<tr><td>Price</td>')
            self.response.out.write('<td>' + str(content.get('pr')) + '</td></tr>')
            legs = content.get('legs')
            i = 0
            for leg in legs:
                i += 1
                self.response.out.write('<tr><td>Way ' + str(i) + ':</td>')
                self.response.out.write('<td>' + leg.get('fr') + " => " + leg['to'] + '</td></tr>')
                self.response.out.write('<tr><td>Arrival ' + str(i) + ':</td>')
                self.response.out.write('<td>' + str(leg.get('a')) + '</td></tr>')
                self.response.out.write('<tr><td>Departure ' + str(i) + ':</td>')
                self.response.out.write('<td>' + str(leg.get('dp')) + '</td></tr>')
        except Exception:
            # some of the split fragments are not flight records; skip them
            pass
    self.response.out.write('</table></body></html>')
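# eval() on scraped content executes arbitrary text as Python, which is
# fragile and unsafe. A hedged alternative sketch using the json module,
# assuming the embedded fragments are JSON-compatible (keys quoted); if they
# are raw JavaScript object literals this would need a more tolerant parser.
import json

def parse_flight_fragment(fragment):
    # a fragment looks like "var x = {...}"; take the right-hand side
    _, _, literal = fragment.partition('=')
    try:
        return json.loads(literal.strip())
    except ValueError:
        return None  # not a flight record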
def doAllLink(self, page): soup = BeautifulSoup(page) self.output_page("RBS-xactlist-all-look.html", page) #<a href="/StatementsFixedPeriod.aspx?id=B7879D8CABBF283B38AE447E07A4EA5D8DA9A859&persist=%2fwEPBQ1BY2NvdW50TnVtYmVyBQg4ODU3MjIxOA%3d%3d%7c%2fwEPBQhGcm9tRGF0ZQUTMTgvMDUvMjAxMSAwMDowMDowMA%3d%3d%7c%2fwEPBQhTb3J0Q29kZQUGNjAxNzIx%7c%2fwEPBQZUb0RhdGUFEzE4LzA3LzIwMTEgMDA6MDA6MDA%3d%7c%2fwEWBh4JU1MyQUNDRERBDwUUU0lOSEEgTS9TVFUgODg1NzIyMTgFKEI3ODc5RDhDQUJCRjI4M0IzOEFFNDQ3RTA3QTRFQTVEOERBOUE4NTkeCFNTMlNQRERBDxBkZBYBAgNnHgZTUzJXTEEPAgFo&showall=1" title="Show all items on a single page">All</a> logging.debug('RBS checking for all links') # find any all link links=soup.findAll('a') link = None for a in links: # detect our link try: if re.search(".tatements.ixed.eriod", a['href']): logging.debug("RBS - got a statement link") if re.search(".ll", a.text): # the one that says all link = self.composeLink(a['href'][:]) logging.debug("RBS - got an All statement link") break # only need the first one so break the for loop except Exception, e: logging.debug('RBS a link error missing href - ' + str(e))
def doStep1(self, allofit, page):
    body = page
    scrape_result = 'good'
    logging.info("NatWest page1")
    # this is how you could retrieve the headers from the request:
    # for head in allofit['headers']:
    #     name = self.HexToByte(head['name'])
    #     val = self.HexToByte(head['value'])
    # write out the start page
    self.output_page("1_first_page.html", body)
    soup = BeautifulSoup(body)
    frame = soup.find('frame', id='ctl00_secframe')
    if frame is not None:
        # <frame id="ctl00_secframe" title="Security Frame" frameborder="0" src="login.aspx?refererident=774C53DCE4C17556595C91973A6DF1A0A1F6242E&cookieid=100714"></frame>
        action = self.urlBase + '/' + frame['src']
        self.response = {}
        self.response['url'] = self.ByteToHex(action)
        self.response['data'] = ""
        self.response['method'] = 'GET'
        self.response['step'] = 2
    else:
        logging.debug('NatWest frame link error')
        scrape_result = 'bank error'
    return scrape_result
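# ByteToHex / HexToByte are used to pack every url and body into the response
# dict but are not defined in these snippets. A minimal sketch, assuming they
# are plain hex codecs over Python 2 byte strings; the real helpers may add
# separators or handle unicode differently.
import binascii

def ByteToHex(self, data):
    return binascii.hexlify(data)

def HexToByte(self, hexstr):
    return binascii.unhexlify(hexstr)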
def doStep1(self, allofit, page): body = page scrape_result = "good" logging.info("NatWest page1") # the following is how you could retrieve the headers from the request # for head in allofit['headers']: # name = self.HexToByte(head['name']) # val = self.HexToByte(head['value']) # write out the start page self.output_page("1_first_page.html", body) soup = BeautifulSoup(body) frame = soup.find("frame", id="ctl00_secframe") if frame != None: action = self.urlBase + "/" + frame["src"] # <frame id="ctl00_secframe" title="Security Frame" frameborder="0" src="login.aspx?refererident=774C53DCE4C17556595C91973A6DF1A0A1F6242E&cookieid=100714"></frame> self.response = {} self.response["url"] = self.ByteToHex(action) self.response["data"] = "" self.response["method"] = "GET" self.response["step"] = 2 else: logging.debug("NatWest frame link error - ") scrape_result = "bank error" return scrape_result
def doStep7(self, page):
    # -------------------- Grab the form values --------------------
    soup = BeautifulSoup(page)
    self.output_page("natwest-acclink.html", page)
    loginform = soup.find("form", attrs={"name": "aspnetForm"})
    if loginform is None:
        logging.debug("NatWest no view account form")
        return "bank error"
    action = self.urlBase + "/" + loginform["action"]
    values = self.parseForm(loginform)
    # fill in our selection - 1 month
    values["ctl00$mainContent$SS2SPDDA"] = "M1"
    # default button - needed
    values["ctl00$mainContent$NextButton_button"] = "View Transactions"
    # build the body content
    data = urllib.urlencode(values)
    self.response = {}
    self.response["url"] = self.ByteToHex(action)
    self.response["data"] = self.ByteToHex(data)
    self.response["method"] = "POST"
    self.response["step"] = 20
    return "good"
def doStep2(self, allofit, page):
    # -------------------- Grab the form values --------------------
    soup = BeautifulSoup(page)
    self.output_page("natwest-username.html", page)
    loginform = soup.find("form", attrs={"name": "aspnetForm"})
    if loginform is None:
        logging.debug("NatWest no login form")
        return "bank error"
    action = self.urlBase + "/" + loginform["action"]
    values = self.parseForm(loginform)
    # fill in our credentials
    values["ctl00$mainContent$LI5TABA$DBID_edit"] = self.filledCreds["01"]  # customer number
    # build the body content
    data = urllib.urlencode(values)
    self.response = {}
    self.response["url"] = self.ByteToHex(action)
    self.response["data"] = self.ByteToHex(data)
    self.response["method"] = "POST"
    self.response["step"] = 3
    return "good"
def firstPass(self, page):
    soup = BeautifulSoup(page)
    loginform = soup.find('form')
    action = loginform['action']
    urls = urlparse(action)
    self.urlBase = "https://" + urls.netloc
    logging.info("Base URL = " + self.urlBase)
    inputs = loginform.findAllNext('input')
    values = {}
    values['userid'] = self.filledCreds['03']  # username
    # build the body content
    data = urllib.urlencode(values)
    self.response = {}
    self.response['url'] = self.ByteToHex(action)
    self.response['data'] = self.ByteToHex(data)
    self.response['method'] = 'POST'
    self.response['step'] = 2
def __init__(self):
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['showUrl']))
    print 'Loading ' + self.BASE_URL + unquote_plus(params['showUrl'])
    seasonIndex = BeautifulSoup(source)
    tvshowcontainer = seasonIndex.find('div', id=re.compile('scet_header|scet_top|show-header|show-header-scet|^header$'))
    if tvshowcontainer is None:
        tvshowcontainer = seasonIndex.find('div', {'class': re.compile('scet_header|scet_top|show-header|show-header-scet')})
    if tvshowcontainer is not None:
        tvshowtitle = tvshowcontainer.find('h1').string
    else:
        tvshowtitle = re.search('var siteName = "(.+?)";', source).group(1)
    print 'Parsing seasons for "%s"' % tvshowtitle
    showsListing = seasonIndex.find('div', {"class": re.compile('scet-gallery-nav')}).find('h3', text='Full Episodes').parent.findNextSibling('ul').findAll('li')
    for show in showsListing:
        showLink = show.find('a')
        print 'Found ' + showLink.string
        listitem = xbmcgui.ListItem(decode_htmlentities(showLink.string))
        listitem.setInfo('video', {'tvshowtitle': tvshowtitle})
        #listitem.setThumbnailImage(showLink.find('img')['src'])
        if showLink['href'][0] == '/':
            showUrl = showLink['href'][1:]
        else:
            showUrl = showLink['href'].replace(self.BASE_URL, '')
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), listitem=listitem,
                                    url="%s?seasonUrl=%s" % (sys.argv[0], quote_plus(showUrl)),
                                    totalItems=len(showsListing), isFolder=True)
    xbmcplugin.setContent(handle=int(sys.argv[1]), content='seasons')
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def DoStep3(self, allofit): scrape_result = "good" page = self.HexToByte(allofit['body']) #-------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) self.output_page("fd-summary.html", page) accountTable = soup.find('table', attrs={'class': 'fdBalancesTable'}) if accountTable != None: self.accountLinks = accountTable.findAll( 'a', attrs={'class': 'fdActionLink'}) if len(self.accountLinks) == 0: #got some kind of message scrape_result = 'bank error' logging.info('Still got no accounts') else: logging.debug("No fd table") scrape_result = 'credentials incorrect' return scrape_result
def doAllLink(self, page): soup = BeautifulSoup(page) self.output_page("natwest-xactlist-all-look.html", page) #<a href="/StatementsFixedPeriod.aspx?id=B7879D8CABBF283B38AE447E07A4EA5D8DA9A859&persist=%2fwEPBQ1BY2NvdW50TnVtYmVyBQg4ODU3MjIxOA%3d%3d%7c%2fwEPBQhGcm9tRGF0ZQUTMTgvMDUvMjAxMSAwMDowMDowMA%3d%3d%7c%2fwEPBQhTb3J0Q29kZQUGNjAxNzIx%7c%2fwEPBQZUb0RhdGUFEzE4LzA3LzIwMTEgMDA6MDA6MDA%3d%7c%2fwEWBh4JU1MyQUNDRERBDwUUU0lOSEEgTS9TVFUgODg1NzIyMTgFKEI3ODc5RDhDQUJCRjI4M0IzOEFFNDQ3RTA3QTRFQTVEOERBOUE4NTkeCFNTMlNQRERBDxBkZBYBAgNnHgZTUzJXTEEPAgFo&showall=1" title="Show all items on a single page">All</a> logging.debug('NatWest checking for all links') # find any all link links = soup.findAll('a') link = None for a in links: # detect our link try: if re.search(".tatements.ixed.eriod", a['href']): logging.debug("natwest - got a statement link") if re.search(".ll", a.text): # the one that says all link = self.composeLink(a['href'][:]) logging.debug("natwest - got an All statement link") break # only need the first one so break the for loop except Exception, e: logging.debug('NatWest a link error missing href - ' + str(e))
def doStep2(self, allofit, page):
    # -------------------- Grab the form values --------------------
    soup = BeautifulSoup(page)
    self.output_page("RBS-username.html", page)
    loginform = soup.find('form', attrs={'name': 'aspnetForm'})
    if loginform is None:
        logging.debug('RBS no login form')
        return 'bank error'
    action = self.urlBase + '/' + loginform['action']
    values = self.parseForm(loginform)
    # fill in our credentials
    values["ctl00$mainContent$LI5TABA$DBID_edit"] = self.filledCreds['01']  # customer number
    # build the body content
    data = urllib.urlencode(values)
    self.response = {}
    self.response['url'] = self.ByteToHex(action)
    self.response['data'] = self.ByteToHex(data)
    self.response['method'] = 'POST'
    self.response['step'] = 3
    return 'good'
def __init__(self):
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['showUrl']))
    showIndex = BeautifulSoup(source)
    vidInfo = {'tvshowtitle': showIndex.find('div', id='showDashboard').find('span', {'class': 'blueText'}).string,
               'studio': 'FOX'}
    seasonsListing = showIndex.findAll('div', {'class': re.compile('dashPageHolder'), 'id': re.compile('^fullEp')})
    print len(seasonsListing)
    for season in seasonsListing:
        episodesListing = season.findAll('div', {'class': 'episodeListing'})
        for episode in episodesListing:
            listitem = xbmcgui.ListItem(episode.find('h3').find('a').string)
            listitem.setThumbnailImage(episode.find('img', id=re.compile('^epThumb'))['src'])
            episodeLink = episode.find('a', {'class': 'thumbnailLink'})
            if episodeLink['href'][0] == '/':
                episodeUrl = episodeLink['href'][1:]
            else:
                episodeUrl = episodeLink['href'].replace(self.BASE_URL, '')
            airedDateAndPlot = re.search(r'Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$',
                                         str(episode.find('div', {'class': 'episodeInfo'})))
            seasonNum = re.search(r'Season\s+([0-9]+?)[\s:]', str(episode.find('p', {'class': 'seasonNum'})))
            episodeNumAndDuration = re.search(r'Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)',
                                              str(episode.find('p', {'class': 'episodeNumLine'})))
            vidInfo['aired'] = '%s-%s-%s' % (airedDateAndPlot.group(3), airedDateAndPlot.group(1), airedDateAndPlot.group(2))
            vidInfo['season'] = int(seasonNum.group(1))
            vidInfo['episode'] = int(episodeNumAndDuration.group(1))
            vidInfo['duration'] = episodeNumAndDuration.group(2)
            vidInfo['title'] = episode.find('h3').find('a').string
            vidInfo['plot'] = decode_htmlentities(airedDateAndPlot.group(4))
            print vidInfo
            listitem.setInfo("video", vidInfo)
            xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), listitem=listitem,
                                        url="%s?episodeUrl=%s" % (sys.argv[0], quote_plus(episodeUrl)))
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def DoStep3(self, allofit): scrape_result = "good" page = self.HexToByte( allofit['body']) #-------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) self.output_page("fd-summary.html", page) accountTable=soup.find('table', attrs={'class' : 'fdBalancesTable'}) if accountTable != None: self.accountLinks=accountTable.findAll('a', attrs={'class' : 'fdActionLink'}) if len(self.accountLinks) == 0: #got some kind of message scrape_result = 'bank error' logging.info('Still got no accounts') else: logging.debug("No fd table"); scrape_result = 'credentials incorrect' return scrape_result
def rewrite_html(self, guid, html=None, ajax_url=None): """if we are not using ajax, then html is IGNORED and we go by the cached copy. html is sometimes used to see if there should be a cached copy at all, or if something goes wrong and we just need to return unaltered html """ guid = str(guid) cache_dir = os.path.join(self._store_location, guid_hash(guid)) mapping_file = os.path.join(cache_dir, guid + "-" + "mapping.pickle") if not os.path.isfile(mapping_file): # quick and dirty check. are there images? if not, plain # html is fine if html.lower().find('<img') >= 0: #logging.warning("Should be downloaded images, but couldn't open mapping. Recaching") self.cache_html(guid, html) return html try: mapping = open(mapping_file, 'r') rewrite_hash = pickle.load(mapping) non_ajax_html = pickle.load(mapping) mapping.close() except: logging.error("error opening cache pickle for guid %s %s" % (guid, mapping_file)) logging.error( "If you have upgraded penguintv, you might need to delete your image cache" ) return html if ajax_url is None: return non_ajax_html #else, rewrite on the fly soup = BeautifulSoup(html) img_tags = soup.findAll('img') if len(img_tags) == 0: return html for result in img_tags: # believe it or not, some img tags don't have a src, they have an id # that points to CSS. At least I think that's what's going on if result.has_key('src'): if rewrite_hash.has_key(result['src']): if rewrite_hash[result['src']][1] == UrlCacher.DOWNLOADED: #if os.path.isfile(os.path.join(self._store_location, rewrite_hash[result['src']][0])): result['src'] = ajax_url + "/cache/" + rewrite_hash[ result['src']][0] #else: # logging.warning("file not found, not replacing") # logging.debug("(should we attempt to recache here?") return soup.prettify()
def _parseNatWestLinks(self, raw):
    soup = BeautifulSoup(raw)
    accountBLock = soup.findAll('a', attrs={'class': 'accountNameExpand'})
    # got some account details now, so all good
    if len(accountBLock) == 0:
        logging.warning('NatWest no accounts after continue form')
        return 'account problem'
    for ac_link in accountBLock:
        ac_link.string = ac_link.text
        self.accountLinks.append(ac_link)
        # now the account-number list - to get the pair data, because we
        # can't get it from the link itself
        row = ac_link.parent.parent
        try:
            # find the account number span
            acnumSpan = row.find('span', attrs={'class': 'AccountNumber'})
            acnum = acnumSpan.text
            acnum = acnum.replace(' ', '')
            # find the sort code span
            sortSpan = row.find('span', attrs={'class': 'SortCode'})
            sortc = sortSpan.text
            sortc = sortc.replace(' ', '')
            sortc = sortc.replace('-', '')
        except Exception, e:
            logging.exception('NatWest form error - ' + str(e))
            return 'bank error'
        # combine the two - to be our matching number
        num = sortc + "-" + acnum
        actype = 'Cheque'
        # might be a credit card
        if len(acnum) > 14:
            actype = 'Credit'
        # now get balances...
        balance = 0
        baltr = ac_link.parent.parent
        baltds = baltr.findAll('td')
        if len(baltds) > 3:  # need at least four cells to read the balance column
            baltext = self.tidy_text(baltds[3].text)
            balance = self.normalise_ammount(baltext)
        # and add it to our account list
        acpair = {'name': ac_link.text, 'num': num, 'type': actype, 'bal': balance}
        self.myAccounts.append(acpair)
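# tidy_text and normalise_ammount are applied to every balance cell but not
# shown. A minimal sketch, assuming tidy_text collapses whitespace and
# normalise_ammount turns a display amount such as u'\xa31,234.56' into a
# float; the real helpers may recognise more currency markers or DR/CR flags.
def tidy_text(self, text):
    return " ".join(text.split())

def normalise_ammount(self, text):
    cleaned = re.sub(r'[^0-9.\-]', '', text)  # strip currency symbols and commas
    try:
        return float(cleaned)
    except ValueError:
        return 0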
def doStep3(self, allofit, page): scrape_result = "good" #-------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) # write out the start page self.output_page("natwest-security.html", page) scrape_result = 'good' logging.info("NatWest security page2") # check if we got returned # check for the password input ctl00$mainContent$LI5TABA$DBID_edit then we didnt move on errorDiv = soup.findAll( 'input', attrs={'name': 'ctl00$mainContent$LI5TABA$DBID_edit'}) if len(errorDiv) != 0: logging.info("NatWest security page1 still - customer number bad") return 'credentials incorrect' # if we get here then the form was found hence creds must be wrong # find our form loginform = soup.find('form', attrs={'name': 'aspnetForm'}) if loginform == None: logging.debug('NatWest no security form') return 'bank error' values = self.parseForm(loginform) # define some variables that would only otherwise exist in a try catch block scope # the label text split on spaces which1arr = "" which2arr = "" which3arr = "" # the chalenges firstDigit = "" secondDigit = "" thirdDigit = "" #>>>>>>> The first set of Pin fields #-------------------- get the questions --------------# #<label for="ctl00_mainContent_Tab1_LI6PPEA_edit" id="ctl00_mainContent_Tab1_LI6DDALALabel" class="wizCrl wizardLabelRememberMeWide">Enter the 2nd number</label> useNewTab = False try: which1 = soup.find('label', attrs={ 'for': 'ctl00_mainContent_LI6PPEA_edit' }).text except Exception, e: useNewTab = True
def __init__(self):
    source = self._fetch_url(self.BASE_FOD_URL)
    fodIndex = BeautifulSoup(source)
    showsListing = fodIndex.find('div', id='episodes-listing').findAll('li')
    for show in showsListing:
        showLink = show.find('a')
        if showLink['href'][0] == '/':
            showUrl = showLink['href'][1:]
        else:
            showUrl = showLink['href'].replace(self.BASE_URL, '')
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]),
                                    listitem=xbmcgui.ListItem(showLink.string),
                                    url="%s?showUrl=%s" % (sys.argv[0], quote_plus(showUrl)),
                                    totalItems=len(showsListing), isFolder=True)
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def rewrite_html(self, guid, html=None, ajax_url=None): """if we are not using ajax, then html is IGNORED and we go by the cached copy. html is sometimes used to see if there should be a cached copy at all, or if something goes wrong and we just need to return unaltered html """ guid = str(guid) cache_dir = os.path.join(self._store_location, guid_hash(guid)) mapping_file = os.path.join(cache_dir, guid + "-" + "mapping.pickle") if not os.path.isfile(mapping_file): # quick and dirty check. are there images? if not, plain # html is fine if html.lower().find('<img') >= 0: #logging.warning("Should be downloaded images, but couldn't open mapping. Recaching") self.cache_html(guid, html) return html try: mapping = open(mapping_file, 'r') rewrite_hash = pickle.load(mapping) non_ajax_html = pickle.load(mapping) mapping.close() except: logging.error("error opening cache pickle for guid %s %s" % (guid, mapping_file)) logging.error("If you have upgraded penguintv, you might need to delete your image cache") return html if ajax_url is None: return non_ajax_html #else, rewrite on the fly soup = BeautifulSoup(html) img_tags = soup.findAll('img') if len(img_tags) == 0: return html for result in img_tags: # believe it or not, some img tags don't have a src, they have an id # that points to CSS. At least I think that's what's going on if result.has_key('src'): if rewrite_hash.has_key(result['src']): if rewrite_hash[result['src']][1] == UrlCacher.DOWNLOADED: #if os.path.isfile(os.path.join(self._store_location, rewrite_hash[result['src']][0])): result['src'] = ajax_url + "/cache/" + rewrite_hash[result['src']][0] #else: # logging.warning("file not found, not replacing") # logging.debug("(should we attempt to recache here?") return soup.prettify()
def _parseRBSLinks(self, raw):
    soup = BeautifulSoup(raw)
    accountBLock = soup.findAll('a', attrs={'class': 'accountNameExpand'})
    # got some account details now, so all good
    if len(accountBLock) == 0:
        logging.warning('RBS no accounts after continue form')
        return 'account problem'
    for ac_link in accountBLock:
        ac_link.string = ac_link.text
        self.accountLinks.append(ac_link)
        # now the account-number list - to get the pair data, because we
        # can't get it from the link itself
        row = ac_link.parent.parent
        try:
            # find the account number span
            acnumSpan = row.find('span', attrs={'class': 'AccountNumber'})
            acnum = acnumSpan.text
            acnum = acnum.replace(' ', '')
            # find the sort code span
            sortSpan = row.find('span', attrs={'class': 'SortCode'})
            sortc = sortSpan.text
            sortc = sortc.replace(' ', '')
            sortc = sortc.replace('-', '')
        except Exception, e:
            logging.exception('RBS form error - ' + str(e))
            return 'bank error'
        # combine the two - to be our matching number
        num = sortc + "-" + acnum
        actype = 'Cheque'
        # might be a credit card
        if len(acnum) > 14:
            actype = 'Credit'
        # now get balances...
        balance = 0
        baltr = ac_link.parent.parent
        baltds = baltr.findAll('td')
        if len(baltds) > 3:  # need at least four cells to read the balance column
            baltext = self.tidy_text(baltds[3].text)
            balance = self.normalise_ammount(baltext)
        # and add it to our account list
        acpair = {'name': ac_link.text, 'num': num, 'type': actype, 'bal': balance}
        self.myAccounts.append(acpair)
def doStep3(self, allofit, page): scrape_result = "good" #-------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) # write out the start page self.output_page("RBS-security.html", page) scrape_result = 'good' logging.info("RBS security page2") # check if we got returned # check for the password input ctl00$mainContent$LI5TABA$DBID_edit then we didnt move on errorDiv=soup.findAll('input', attrs={'name' : 'ctl00$mainContent$LI5TABA$DBID_edit'}) if len(errorDiv) != 0: logging.info("RBS security page1 still - customer number bad") return 'credentials incorrect' # if we get here then the form was found hence creds must be wrong # find our form loginform=soup.find('form', attrs={'name': 'aspnetForm'}) if loginform == None: logging.debug('RBS no security form') return 'bank error' values = self.parseForm(loginform) # define some variables that would only otherwise exist in a try catch block scope # the label text split on spaces which1arr = "" which2arr = "" which3arr = "" # the chalenges firstDigit = "" secondDigit = "" thirdDigit = "" #>>>>>>> The first set of Pin fields #-------------------- get the questions --------------# #<label for="ctl00_mainContent_Tab1_LI6PPEA_edit" id="ctl00_mainContent_Tab1_LI6DDALALabel" class="wizCrl wizardLabelRememberMeWide">Enter the 2nd number</label> useNewTab = False try: which1=soup.find('label', attrs={'for' : 'ctl00_mainContent_LI6PPEA_edit'}).text except Exception, e: useNewTab = True
def get(self):
    bookname = self.request.get('search')
    req = isbndbpy.Request('books', 'combined', str(bookname))
    resp = req.send().read()
    soup = BS(str(resp))
    books = soup.findAll('bookdata')
    self.response.out.write('<html><body>')
    for bookdata in books:
        self.response.out.write('<br/>Title: ' + str(bookdata.find('title').string))
        self.response.out.write('<br/>ISBN: ' + str(bookdata.get('isbn13')))
        self.response.out.write('<br/>AUTHOR: ' + str(bookdata.find('authorstext').string))
        self.response.out.write('<br/>PUBLISHER: ' + str(bookdata.find('publishertext').string))
        self.response.out.write('<br/>***********')
    self.response.out.write('</body></html>')
def _parseNatWestLinks(self, raw): soup = BeautifulSoup(raw) accountBLock = soup.findAll("a", attrs={"class": "accountNameExpand"}) # got some acount details now so all good if len(accountBLock) == 0: logging.warning("NatWest no accounts after continue form") return "account problem" for ac_link in accountBLock: ac_link.string = ac_link.text self.accountLinks.append(ac_link) # now the accnum list - to get the pair data, cos cant get it from link row = ac_link.parent.parent try: # find the account number span acnumSpan = row.find("span", attrs={"class": "AccountNumber"}) acnum = acnumSpan.text acnum = acnum.replace(" ", "") # find the sort code span sortSpan = row.find("span", attrs={"class": "SortCode"}) sortc = sortSpan.text sortc = sortc.replace(" ", "") sortc = sortc.replace("-", "") except Exception, e: logging.exception("NatWest form error - " + str(e)) return "bank error" # combine the two - to be our matching number num = sortc + "-" + acnum actype = "Cheque" # might be a credit card if len(acnum) > 14: actype = "Credit" # now get balances... balance = 0 baltr = ac_link.parent.parent baltds = baltr.findAll("td") if len(baltds) > 2: baltext = self.tidy_text(baltds[3].text) balance = self.normalise_ammount(baltext) # and add it to our account list acpair = {"name": ac_link.text, "num": num, "type": actype, "bal": balance} self.myAccounts.append(acpair)
def DoStep2(self, allofit):
    page = self.HexToByte(allofit['body'])
    # -------------------- Grab the form values --------------------
    soup = BeautifulSoup(page)
    self.output_page("fd-username.html", page)
    loginform = soup.find('form')
    action = loginform['action']
    inputs = loginform.findAllNext('input')
    values = {}
    self.response = {}
    # build the post values up - there aren't any others afaik
    ps = loginform.findAllNext('p')
    numbers = ps[1].findAllNext('strong')
    try:
        password = (self.lookupdigit(numbers[0].text)
                    + self.lookupdigit(numbers[1].text)
                    + self.lookupdigit(numbers[2].text))
    except:
        # not enough lookup digits
        logging.debug("credentials incorrect")
        return 'credentials incorrect'
    answer = self.filledCreds['06']
    values['password'] = password
    values['memorableAnswer'] = answer
    # build the body content
    data = urllib.urlencode(values)
    self.response['url'] = self.ByteToHex(action)
    self.response['data'] = self.ByteToHex(data)
    self.response['method'] = 'POST'
    self.response['step'] = 3
    return 'good'
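# lookupdigit turns a challenge label such as u'2nd' into the matching digit
# of the stored password. A minimal sketch; the 1-based-position parsing and
# the credential slot key '05' are assumptions, not confirmed by the snippet
# (a failed match raises, which the caller's except treats as bad credentials).
def lookupdigit(self, label):
    position = int(re.match(r'\D*(\d+)', label).group(1))  # u'2nd' -> 2
    return self.filledCreds['05'][position - 1]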
def _get_soup(self, html):
    try:
        return BeautifulSoup(html)
    except:
        logging.warning("BeautifulSoup exception cleaning up html, can't cache images offline")
        return None
def _parseComment(self, communityId, liveInfoFilePath, commentFilePath):
    chatList = []
    if not (os.path.exists(liveInfoFilePath) and os.path.exists(commentFilePath)):
        return chatList
    infoParser = BeautifulSoup(open(liveInfoFilePath, u'r'))
    if not infoParser.find(u'communityid').renderContents() == communityId:
        return chatList
    commentParser = BeautifulSoup(open(commentFilePath, u'r'))
    chatTagList = commentParser.findAll(u'chat', attrs={u'msgkind': u'message_msg'})
    for chatTag in chatTagList:
        communityId = communityId.decode(u'utf-8')
        liveId = infoParser.find(u'liveid').renderContents().decode()
        userId = chatTag.get(u'user').decode(u'utf-8')
        name = chatTag.get(u'nickname').decode(u'utf-8')
        message = chatTag.renderContents().decode(u'utf-8')
        option = chatTag.get(u'mail').decode(u'utf-8') if chatTag.get(u'mail') != '' else None
        # normalise 'YYYY/M/D H:M:S' to zero-padded 'YYYY-MM-DD HH:MM:SS'
        date = re.sub(
            ur'(\d{4})/(\d{1,2})/(\d{1,2})\s(\d{1,2}):(\d{1,2}):(\d{1,2})',
            lambda match: u'{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}'.format(
                int(match.group(1)), int(match.group(2)), int(match.group(3)),
                int(match.group(4)), int(match.group(5)), int(match.group(6))),
            chatTag.get(u'date')).decode(u'utf-8')
        chatList.append((communityId, liveId, userId, name, message, option, date))
    return chatList
def doStep12(self, page):
    # -------------------- Grab the form values --------------------
    soup = BeautifulSoup(page)
    self.output_page("RBS-xactlist-cc-poss.html", page)
    rightButtons = soup.findAll('a', attrs={'class': 'link-button-right'})
    # any buttons?
    if len(rightButtons) == 0:
        logging.error('RBS no cc account buttons')
        return 'bank error'
    # RBS is not dynamic, so this static list is fine (unlike Smile)
    acLink = None
    for a in rightButtons:
        # filter out the account-detail buttons, matching just the statement
        # buttons; we rely on this regex finding them in the right order
        if re.search(".ard.tatement.etail", a['href']):
            acLink = a['href'][:]
    if acLink is None:
        logging.debug('RBS no cc detail link')
        return 'bank error'
    # action = self.urlBase + '/' + loginform['action']
    action = acLink
    try:
        # if the link parses as a full URL, use it as-is
        logging.debug("checking link - " + acLink)
        urls = urlparse(acLink)
    except Exception, e:
        logging.error('RBS cc link error - ' + str(e))
        action = self.urlBase + '/' + acLink
def __init__(self):
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['seasonUrl']))
    showIndex = BeautifulSoup(source)
    tvshowcontainer = showIndex.find('div', id=re.compile('scet_header|scet_top|show-header|show-header-scet|^header$'))
    if tvshowcontainer is None:
        tvshowcontainer = showIndex.find('div', {'class': re.compile('scet_header|scet_top|show-header|show-header-scet')})
    if tvshowcontainer is not None:
        tvshowtitle = tvshowcontainer.find('h1').string
    else:
        tvshowtitle = re.search('var siteName = "(.+?)";', source).group(1)
    pages = 1
    if showIndex.find('div', {'class': re.compile('nbcu_pager')}):
        pageLinks = showIndex.find('div', {'class': re.compile('nbcu_pager')}).findAll('a', {'class': re.compile('nbcu_pager_page')})
        pages = len(pageLinks)
    for i in range(0, pages):
        if i > 0:
            source = self._fetch_url(self.BASE_URL + pageLinks[i]['href'])
            showIndex = BeautifulSoup(source)
        episodesListing = showIndex.find('ul', {'class': re.compile('scet_th_full')}).findAll('li')
        for episode in episodesListing:
            vidInfo = {'tvshowtitle': tvshowtitle, 'studio': 'NBC'}
            title = decode_htmlentities(episode.find('p', {'class': re.compile('list_full_det_title')}).find('a').string)
            listitem = xbmcgui.ListItem(title)
            listitem.setThumbnailImage(episode.find('img')['src'])
            episodeLink = episode.find('a')
            if episodeLink['href'][0] == '/':
                episodeUrl = episodeLink['href'][1:]
            else:
                episodeUrl = episodeLink['href'].replace(self.BASE_URL, '')
            if episode.find('p', {'class': re.compile('list_full_des')}):
                vidInfo['plot'] = decode_htmlentities(episode.find('p', {'class': re.compile('list_full_des')}).find('em').string)
            epNum = re.search(r'^Ep(?:\.\s*)?([0-9]{1,2})([0-9][0-9])(?:\s*:\s*)?(.+)$', title)
            if epNum is not None:
                vidInfo['season'] = int(epNum.group(1))
                vidInfo['episode'] = int(epNum.group(2))
                vidInfo['title'] = epNum.group(3)
            listitem.setInfo("video", vidInfo)
            # use .get() so titles without an "EpSSEE" prefix don't raise KeyError
            xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), listitem=listitem,
                                        url="%s?episodeUrl=%s&episode=%s&season=%s"
                                        % (sys.argv[0], quote_plus(episodeUrl),
                                           vidInfo.get('episode', ''), vidInfo.get('season', '')))
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]), sortMethod=xbmcplugin.SORT_METHOD_EPISODE)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]), sortMethod=xbmcplugin.SORT_METHOD_DATE)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]), sortMethod=xbmcplugin.SORT_METHOD_LABEL)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]), sortMethod=xbmcplugin.SORT_METHOD_DURATION)
    xbmcplugin.setContent(handle=int(sys.argv[1]), content='episodes')
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def doStep1(self, allofit, page): body = page scrape_result = 'good' logging.info("RBS page1") # the following is how you could retrieve the headers from the request # for head in allofit['headers']: # name = self.HexToByte(head['name']) # val = self.HexToByte(head['value']) # write out the start page self.output_page("1_first_page.html", body) soup = BeautifulSoup(body); frame = soup.find('frame', id='ctl00_secframe'); if frame != None: action = self.urlBase + '/' + frame['src']; #<frame id="ctl00_secframe" title="Security Frame" frameborder="0" src="login.aspx?refererident=774C53DCE4C17556595C91973A6DF1A0A1F6242E&cookieid=100714"></frame> self.response = {} self.response['url'] = self.ByteToHex(action) self.response['data'] = "" self.response['method'] = 'GET' self.response['step'] = 2 else: logging.debug('RBS frame link error - ') scrape_result = 'bank error' return scrape_result
def _loadComment(self, communityId, userSettingFilePath, commentLogFolder):
    nameDict = self._loadUserSetting(communityId, userSettingFilePath)
    commentLogFileList = filter(
        lambda file: re.match(ur'ncvLog_lv\d+-{0}\.xml$'.format(communityId), file),
        os.listdir(commentLogFolder))
    chatList = []
    for commentFile in commentLogFileList:
        parser = BeautifulSoup(open(os.path.join(commentLogFolder, commentFile), u'r'))
        liveId = u'lv' + parser.find(u'livenum').renderContents().decode(u'utf-8')
        chatTagList = parser.find(u'livecommentdataarray').findAll(u'chat', recursive=False)
        for chatTag in chatTagList:
            userId = chatTag.get(u'user_id')
            if chatTag.get(u'user_id') == u'':
                continue
            name = nameDict.get(userId)
            message = chatTag.renderContents().decode(u'utf-8')
            option = chatTag.get(u'mail')
            unixtime = time.localtime(int(chatTag.get(u'date')))
            # format the timestamp; guard against a missing date rather than
            # calling .decode() on None as the original did
            date = (datetime.datetime(*unixtime[:-3]).strftime(u'%Y-%m-%d %H:%M:%S').decode(u'utf-8')
                    if unixtime else None)
            chatList.append((communityId, liveId, userId, name, message, option, date))
    return chatList
def doStep12(self, page):
    # -------------------- Grab the form values --------------------
    soup = BeautifulSoup(page)
    self.output_page("natwest-xactlist-cc-poss.html", page)
    rightButtons = soup.findAll('a', attrs={'class': 'link-button-right'})
    # any buttons?
    if len(rightButtons) == 0:
        logging.error('NatWest no cc account buttons')
        return 'bank error'
    # NatWest is not dynamic, so this static list is fine (unlike Smile)
    acLink = None
    for a in rightButtons:
        # filter out the account-detail buttons, matching just the statement
        # buttons; we rely on this regex finding them in the right order
        if re.search(".ard.tatement.etail", a['href']):
            acLink = a['href'][:]
    if acLink is None:
        logging.debug('NatWest no cc detail link')
        return 'bank error'
    # action = self.urlBase + '/' + loginform['action']
    action = acLink
    try:
        # if the link parses as a full URL, use it as-is
        logging.debug("checking link - " + acLink)
        urls = urlparse(acLink)
    except Exception, e:
        logging.error('NatWest cc link error - ' + str(e))
        action = self.urlBase + '/' + acLink
def __makerequest(self, cmd, **kwargs): kwargs["cmd"] = cmd if self._token: kwargs["token"] = self._token try: response = BeautifulSoup( self._opener.open(self._url + urllib.urlencode( dict([ k, v.encode('utf-8') if isinstance(v, basestring) else v ] for k, v in kwargs.items())))).response except urllib2.URLError, e: raise FogBugzConnectionError(e)
def read_timetable_file(self, route_filename, weekday, direction):
    # Example of how the timetables can be read and returned as a map
    from BeautifulSoup.BeautifulSoup import BeautifulSoup
    import urllib2, re, time
    filestr = 'data/timetables/%s_%s_%s.html' % (route_filename, weekday, direction)
    fil = open(filestr, "r")
    soup = BeautifulSoup(fil.read(), fromEncoding='utf8')
    fil.close()
    divs = soup.html.body.findAll('div')
    children = divs[0].contents
    # timetable
    tt = children[1].contents[3].contents[3].contents[3].contents[1].contents[2]
    route_list = []
    route_times_list = []
    # stop name values
    for (j, name) in enumerate(tt.contents[0].contents[4].contents):
        route_times_list = []
        route_name = name.contents[1].find('a').contents[0]
        print route_name
        # am / pm values
        for (i, name) in enumerate(tt.contents[0].contents[3].contents[2].contents):
            time_value = tt.contents[0].contents[5].contents[j + 1].contents[i].contents[0].text
            if time_value == '-':
                print time_value
                continue
            time_prefix = name.text
            # values minus the first
            time_str = '' + time_value + ' ' + time_prefix
            try:
                time_result = time.strftime('%H:%M:%S', time.strptime(time_str, '%I:%M %p'))
                route_times_list.append(time_result)
                print time_result
            except:
                print "ERR", time_str
        route_list.append((route_name, route_times_list))
    return route_list
def __init__(self):
    print 'Fetching %s' % self.INDEX_URL
    source = self._fetch_url(self.INDEX_URL)
    fodIndex = BeautifulSoup(source)
    showsListing = fodIndex.find('div', {"class": re.compile('group-full-eps')}).findAll('li')
    print 'Parsed listing and found %d shows' % len(showsListing)
    for show in showsListing:
        showLink = show.find('a')
        listitem = xbmcgui.ListItem(decode_htmlentities(showLink['title']))
        episodeCount = show.find('div', text=re.compile('^[0-9]+ Videos?$'))
        if episodeCount:
            episodeCount = int(re.search(r'^([0-9]+)\s*Videos?$', episodeCount.string).group(1))
            print 'Found "%s" with %d episodes' % (decode_htmlentities(showLink['title']), episodeCount)
            listitem.setInfo('video', {'episode': episodeCount})
        else:
            print 'Found "%s" but did not find how many episodes' % decode_htmlentities(showLink['title'])
        listitem.setThumbnailImage(showLink.find('img')['src'])
        if showLink['href'][0] == '/':
            showUrl = showLink['href'][1:]
        else:
            showUrl = showLink['href'].replace(self.BASE_URL, '')
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), listitem=listitem,
                                    url="%s?showUrl=%s" % (sys.argv[0], quote_plus(showUrl)),
                                    totalItems=len(showsListing), isFolder=True)
    xbmcplugin.setContent(handle=int(sys.argv[1]), content='tvshows')
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def __init__(self, url, token=None):
    self.__handlerCache = {}
    if not url.endswith('/'):
        url += '/'
    if token:
        self._token = token.encode('utf-8')
    else:
        self._token = None  # was "self_token = None", a typo that left the attribute unset
    self._opener = urllib2.build_opener()
    try:
        soup = BeautifulSoup(self._opener.open(url + 'api.xml'))
    except urllib2.URLError:
        # report the url argument; self._url is not set yet at this point
        raise FogBugzConnectionError(
            "Library could not connect to the FogBugz API. Either this "
            "installation of FogBugz does not support the API, or the "
            "url, %s, is incorrect." % (url,))
    self._url = url + soup.response.url.string
    self.currentFilter = None
def _loadUserSetting(self, communityId, userSettingFilePath):
    parser = BeautifulSoup(open(userSettingFilePath, u'r'))
    nameTagList = parser.findAll(u'user', attrs={u'community': communityId, u'name': True})
    return dict(map(lambda tag: (tag.renderContents(), tag.get(u'name')), nameTagList))
def doStep4(self, allofit, page): scrape_result = "good" #-------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) # write out the start page self.output_page("RBS-pos-accounts.html", page) scrape_result = 'good' logging.info("RBS message or bad cred check ") # if we still have the input then def bad credentials errorDiv=soup.findAll('input', attrs={'name' : 'ctl00$mainContent$LI6PPEA_edit'}) if len(errorDiv) != 0: logging.info("RBS defiantely bad credentials") return 'credentials incorrect' accountBLock=soup.findAll('table', attrs={'class' : 'AccountTable'}) # got some acount details so all good if len(accountBLock) > 0: logging.debug("RBS defiantely got some good accounts") return 'good'; # find any link # if we find a link return it # check for the normal continue button and fail all else - with credentials failure continueButton = soup.find('input', attrs={'id' : 'ctl00_mainContent_FinishButton_button'}) if(continueButton == None): logging.warning("RBS cant find finish button credentials incorrect") nextButton = soup.find('input', attrs={'id' : 'ctl00_mainContent_NextButton_button'}) if(nextButton == None): logging.warning("RBS cant find next button either") return 'credentials incorrect' # now find the form that these buttons belong to loginform=soup.find('form', attrs={'name': 'aspnetForm'}) if loginform == None: logging.debug('RBS no continue form') return 'bank error' action = self.urlBase + '/' + loginform['action'] # any hidden values etc values = self.parseForm(loginform) # build the body content data = urllib.urlencode(values) self.response = {} self.response['url'] = self.ByteToHex(action) self.response['data'] = self.ByteToHex(data) self.response['method'] = 'POST' self.response['step'] = 4 return 'messages'
def _processCCAccount(self, raw, account_path, balance):
    soup = BeautifulSoup(raw)
    logging.debug('CC ac path - ' + str(account_path) + ' - end')
    try:
        if account_path != "":
            # delete existing current xactions
            logging.debug('Processing :) ')
            builder = StatementBuilder(self.facade, account_path, self.token)
            self.statementlist.append(builder)
            self.statementbuilder = self.statementlist[self.current_statement]
            # we know this is a credit card
            isCCard = True
            # get a fixed balance somewhere?? passed in for NatWest
            # set up our statement
            self.statementbuilder.make_recent_dif_statement('NatWest-recent', 'Scraper', None)
            # now set the final balance
            logging.debug("Balance - - - - - - - > " + str(balance))
            self.statementbuilder.set_current_balance(balance)
            # now find all the recent transactions
            x_table = soup.find('table', attrs={'class': 'ItemTable'})
            if x_table is not None:
                x_body = x_table.find('tbody')
                inputs = x_body.findAll('tr')
                # build the post values up
                for rows in inputs:
                    atts = {}
                    vals = rows.findAll('td')
                    if vals:
                        datebit = ''
                        for i, val in enumerate(vals):
                            data = self.tidy_text(val.text)
                            if i == 0:
                                dp = DateParser()
                                try:
                                    atts['date'] = dp.ymd_from_date(dp.date_from_small(data))
                                except:
                                    atts['date'] = ''  # was '==', a comparison that silently did nothing
                            if i == 1:
                                datebit = data[:-5]
                            if i == 2:
                                if data != 'SALE':
                                    # only keep the extra xact date for sales
                                    datebit = ''
                            if i == 3:
                                if data != "":
                                    atts['display'] = " ".join(data.split()).encode('utf8')
                                    atts['extradisplay'] = datebit.encode('utf8')
                            if i > 3:
                                # the numbers
                                if data != "" and data != '-':
                                    amount = self.normalise_ammount(data)
                                    if i == 4:
                                        atts['amount'] = amount
                                        atts['type'] = 'Credit'
                                    if i == 5:
                                        atts['amount'] = amount
                                        atts['type'] = 'Debit'
                            if i == 5:
                                self.statementbuilder.make_xact(atts)
            self.statementbuilder.put_statement()
            self.current_statement = self.current_statement + 1
    except Exception, e:
        logging.exception('NatWest parsing error - ' + str(e))
def doStep4(self, allofit, page): scrape_result = "good" #-------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) # write out the start page self.output_page("natwest-pos-accounts.html", page) scrape_result = 'good' logging.info("NatWest message or bad cred check ") # if we still have the input then def bad credentials errorDiv = soup.findAll( 'input', attrs={'name': 'ctl00$mainContent$LI6PPEA_edit'}) if len(errorDiv) != 0: logging.info("NatWest defiantely bad credentials") return 'credentials incorrect' accountBLock = soup.findAll('table', attrs={'class': 'AccountTable'}) # got some acount details so all good if len(accountBLock) > 0: logging.debug("NatWest defiantely got some good accounts") return 'good' # find any link # if we find a link return it # check for the normal continue button and fail all else - with credentials failure continueButton = soup.find( 'input', attrs={'id': 'ctl00_mainContent_FinishButton_button'}) if (continueButton == None): logging.warning( "NatWest cant find finish button credentials incorrect") nextButton = soup.find( 'input', attrs={'id': 'ctl00_mainContent_NextButton_button'}) if (nextButton == None): logging.warning("NatWest cant find next button either") return 'credentials incorrect' # now find the form that these buttons belong to loginform = soup.find('form', attrs={'name': 'aspnetForm'}) if loginform == None: logging.debug('NatWest no continue form') return 'bank error' else: logging.debug('found a continue form - so clicking it') action = self.urlBase + '/' + loginform['action'] # any hidden values etc values = self.parseForm(loginform) # build the body content data = urllib.urlencode(values) self.response = {} self.response['url'] = self.ByteToHex(action) self.response['data'] = self.ByteToHex(data) self.response['method'] = 'POST' self.response['step'] = 4 return 'messages'
class ProntuarioVeiculo(object):
    '''
    Parses a DetranNet vehicle record ("prontuario") HTML page into a
    dict of sections.
    '''

    def __init__(self, prontuario_html):
        '''
        Constructor
        '''
        self.__soup = BeautifulSoup(prontuario_html,
                                    fromEncoding='iso-8859-1',
                                    convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.__prontuario = {}
        self._parsear_dados_veiculo()
        self._parsear_debitos()
        self._parsear_infracoes_em_autuacao()
        self._parsear_listagem_multas()
        self._parsear_historico_multas()
        self._parsear_ultimo_processo()
        self._parsear_recurso_infracao()

    def obter_prontuario(self):
        return deepcopy(self.__prontuario)

    def _parsear_dados_veiculo(self):
        tabela = self.__soup.find('div', id='div_servicos_02').table.tbody
        for celula in tabela.findAll('td'):
            dado = celula.findAll(text=True)
            if len(dado) == 2:
                self.__prontuario[dado[0].strip()] = dado[1].strip()

    def _parsear_debitos(self):
        tabela = self.__soup.find('div', id='div_servicos_03').table.tbody
        debitos = []
        for linha in tabela.findAll('tr')[1:-1]:
            debito = {}
            texto = linha.td.findAll(text=True)
            if texto == None:
                texto = ''
            debito[u'Classe'] = ''.join(texto).strip()
            link = ''
            if linha.td.a != None:
                link = linha.td.a['href'].strip()
            debito[u'Link'] = link
            celulas = [u'Número DetranNet', u'Vencimento', u'Valor Nominal(R$)',
                       u'Multa(R$)', u'Juros(R$)', u'Valor Atual(R$)']
            for celula, valor in zip(celulas, linha.findAll('td')[1:]):
                debito[celula] = valor.string.strip()
            debitos.append(debito)
        self.__prontuario[u'Débitos'] = debitos

    def _parsear_infracoes_em_autuacao(self):
        tabela = self.__soup.find('div', id='div_servicos_10').table.tbody
        celula_filha = lambda tag: tag.name == 'td' and tag.table == None
        celulas = tabela.findAll(celula_filha)[3:]
        infracoes = []
        for i in range(len(celulas) / 7):
            linha = 7 * i
            infracao = {}
            infracao[u'Número'] = celulas[linha].a.string
            infracao[u'Link'] = celulas[linha].a['href'].strip()
            infracao[u'Valor'] = celulas[linha + 1].string.strip()
            infracao[u'Situação'] = celulas[linha + 2].string.strip()
            infracao[u'Descrição 1'] = celulas[linha + 3].string.strip()
            infracao[u'Descrição 2'] = celulas[linha + 4].string.strip()
            infracao[u'Local/Complemento 1'] = celulas[linha + 5].string.strip()
            infracao[u'Local/Complemento 2'] = celulas[linha + 6].string
            if infracao[u'Local/Complemento 2'] == None:
                infracao[u'Local/Complemento 2'] = u''
            infracao[u'Local/Complemento 2'] = infracao[u'Local/Complemento 2'].strip()
            infracoes.append(infracao)
        self.__prontuario[u'Infrações em Autuação'] = infracoes

    #TODO: Implement
    def _parsear_listagem_multas(self):
        tabela = self.__soup.find('div', id='div_servicos_04').table.tbody
        if tabela.tr.td.find(text=re.compile(u'Nenhuma?')):
            self.__prontuario[u'Listagem de Multas'] = []
            return

    def _parsear_historico_multas(self):
        tabela = self.__soup.find('div', id='div_servicos_07').table.tbody
        celula_filha = lambda tag: tag.name == 'td' and tag.table == None
        celulas = tabela.findAll(celula_filha)[3:]
        multas = []
        for i in range(len(celulas) / 7):
            linha = 7 * i
            multa = {}
            multa[u'Número'] = celulas[linha].a.string
            multa[u'Link'] = celulas[linha].a['href'].strip()
            multa[u'Lançamento'] = celulas[linha + 1].string.strip()
            multa[u'Pagamento'] = celulas[linha + 2].string.strip()
            multa[u'Descrição 1'] = celulas[linha + 3].string.strip()
            multa[u'Descrição 2'] = celulas[linha + 4].string.strip()
            multa[u'Local/Complemento 1'] = celulas[linha + 5].string.strip()
            multa[u'Local/Complemento 2'] = celulas[linha + 6].string
            if multa[u'Local/Complemento 2'] == None:
                multa[u'Local/Complemento 2'] = u''
            multa[u'Local/Complemento 2'] = multa[u'Local/Complemento 2'].strip()
            multas.append(multa)
        self.__prontuario[u'Histórico de Multas'] = multas

    def _parsear_ultimo_processo(self):
        tabela = self.__soup.find('div', id='div_servicos_11').table.tbody
        ultimo_processo = {}
        celulas = tabela.findAll('td')
        for celula in celulas[:5]:
            dado = celula.findAll(text=True)
            ultimo_processo[dado[0]] = dado[1]
        for i in range(7, len(celulas), 2):
            chave = celulas[i].findAll(text=True)[0]
            valor = celulas[i + 1].findAll(text=True)[0]
            ultimo_processo[chave] = valor
        self.__prontuario[u'Último Processo'] = ultimo_processo

    #TODO: Implement
    def _parsear_recurso_infracao(self):
        tabela = self.__soup.find('div', id='div_servicos_09').table.tbody
        if tabela.tr.td.find(text=re.compile(u'Nenhuma?')):
            self.__prontuario[u'Recurso de Infração'] = []
            return
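# A minimal usage sketch for the class above. The file name is an
# assumption; any saved DetranNet record page would do.
if __name__ == '__main__':
    html = open('prontuario.html', 'rb').read()
    prontuario = ProntuarioVeiculo(html).obter_prontuario()
    for secao, dados in prontuario.items():
        print secao, '->', dados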
def _processNormAccount(self, raw, account_path, balance):
    soup = BeautifulSoup(raw)
    logging.debug('Norm ac path - ' + str(account_path) + ' - end')
    try:
        if account_path != "":
            # delete existing current xactions
            logging.debug('Processing :) norm ')
            builder = StatementBuilder(self.facade, account_path, self.token)
            self.statementlist.append(builder)
            self.statementbuilder = self.statementlist[self.current_statement]
            # we know this is not a credit card
            isCCard = False
            # get a fixed balance somewhere??
            # balance passed in for RBS
            # set up our statement
            self.statementbuilder.make_recent_dif_statement('RBS-recent', 'Scraper', None)
            # now set the final balance
            logging.debug("Balance - - - - - - - > " + str(balance))
            self.statementbuilder.set_current_balance(balance)
            # now find all the recent transactions
            x_table = soup.find('table', attrs={'class': 'ItemTable'})
            if x_table == None:
                # could easily be no transactions
                logging.debug(" No xtable ======>")
            if x_table != None:
                x_body = x_table.find('tbody')
                inputs = x_body.findAll('tr')
                # build the post values up
                for rows in inputs:
                    atts = {}
                    vals = rows.findAll('td')
                    if vals:
                        cash = ''
                        for i, val in enumerate(vals):
                            data = self.tidy_text(val.text)
                            if i == 0:
                                logging.debug("date ======> " + data)
                                dp = DateParser()
                                try:
                                    atts['date'] = dp.ymd_from_date(dp.date_from_small(data))
                                except:
                                    atts['date'] = ''
                            if i == 1:
                                if data == 'ATM':
                                    cash = 'CASH - '
                            if i == 2:
                                if data != "":
                                    extra = ""
                                    datebit = ""
                                    parts = data.split(',')
                                    if len(parts) > 1:
                                        # match RBS dates - a.la. 8062 14APR11
                                        if re.match('\d{4}\s\d\d[A-Z]{3}\d\d', parts[0]) != None:
                                            datebit = parts[0][0:4] + ' ' + parts[0][5:7] + ' ' + parts[0][7:10]
                                        # remember pretty_display strips out any words containing a sequence of 3 or more numbers
                                        parts = parts[1:]
                                        if len(parts) > 1:
                                            extra = parts[-1]
                                            parts = parts[0:-1]
                                        data = ' '.join(parts)
                                    disp = (cash + data).strip()
                                    atts['display'] = " ".join(disp.split())
                                    atts['extradisplay'] = " ".join((extra + " " + datebit).split())
                            if i > 2:
                                # the numbers
                                if data != "" and data != '-':
                                    logging.debug('->' + data + '<-')
                                    amount = self.normalise_ammount(data)
                                    if i == 3:
                                        atts['amount'] = amount
                                        atts['type'] = 'Credit'
                                    if i == 4:
                                        atts['amount'] = amount
                                        atts['type'] = 'Debit'
                            if i == 5:
                                # last column - commit this row's transaction
                                self.statementbuilder.make_xact(atts)
            self.statementbuilder.put_statement()
            self.current_statement = self.current_statement + 1
    except Exception, e:
        logging.exception('RBS parsing error - ' + str(e))
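# normalise_ammount() (spelling as used throughout these scrapers) is not
# defined in this file. A minimal sketch, assuming it converts a displayed
# amount such as u'£1,234.56' into integer pence, matching the
# int(float(data) * 100) convention used elsewhere in this collection:
def normalise_ammount(self, data):
    # strip currency symbol, thousands separators and whitespace, then scale to pence
    cleaned = data.replace(u'£', '').replace(',', '').strip()
    return int(float(cleaned) * 100)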
def processAccount(self, acCount, acName, account_path, allofit):
    page = self.HexToByte(allofit['body'])
    # save this page
    self.output_page("account" + str(acCount) + ".html", page)
    soup = BeautifulSoup(page)
    logging.debug('ac path - ' + str(account_path) + ' - end')
    if account_path != "":
        # delete existing current xactions
        logging.debug('Processing :) ')
        self.statementbuilder = StatementBuilder(self.facade, account_path, self.token)
        # need to get last statement and make a new one every time
        self.statementbuilder.make_recent_dif_statement('Fd-recent', 'Scraper', None)  #TODO change this
        isVisa = False
        loginform = soup.find('input', attrs={'name': 'cmd_sort_referenceAscending'})
        if loginform != None:
            isVisa = True
            bal_tables = soup.findAll('table', attrs={'class': 'fdTableBackgroundOne'})
            balance_table = bal_tables[2]
            if balance_table != None:
                vals = balance_table.findAll('td')
                if vals:
                    bal = vals[1].text
                    data = bal.replace('£', u'£')
                    data = data.strip(u'£')
                    if data[-1] == 'D':
                        data = data.replace('DB', '')
                        data = data.replace('D', '')
                        lastbal = int(float(data) * 100)
                        firstbal = 0 - lastbal
                    else:
                        data = data.replace('CR', '')
                        data = data.replace('C', '')
                        firstbal = int(float(data) * 100)
                    self.statementbuilder.set_current_balance(firstbal)
        logging.debug("-----------------------------*******---------------------")
        if isVisa:
            logging.debug("found visa --")
        acTable = soup.find('table', attrs={'class': 'fdStatTable'})
        # if no table then no new data afaik
        if acTable != None:
            datarows = acTable.findAll('tr')
            # build the post values up
            # atts persists across rows so the date carries forward to undated rows
            atts = {}
            isFirst = True
            firstbal = 0
            firstdate = ""
            lastbal = 0
            lastdate = ""
            doBalance = False
            dp = DateParser()
            for rows in datarows:
                vals = rows.findAll('td')
                if vals:
                    for i, val in enumerate(vals):
                        if val.text:
                            data = val.text.strip()
                            data = unescape(data)
                            data = unicode(data)
                        else:
                            data = ""
                        if data != " ":
                            data = data.replace(' ', '')
                        if i == 0:
                            if data != "":
                                try:
                                    lastdate = dp.ymd_from_date(dp.date_from_dmy(data, '/'))
                                except:
                                    logging.warn("Invalid FD date format - probably no transactions")
                                    return
                                if firstdate == "":
                                    firstdate = lastdate
                                atts['date'] = lastdate
                        if (i == 1 and not isVisa) or (i == 2 and isVisa):
                            atts['display'] = data[0:19]
                            atts['extradisplay'] = data[19:]
                        if (i == 2 and not isVisa) or (i == 3 and isVisa):
                            if data != "":
                                data = data.strip(u'£')
                                data = data.strip(u'D')
                                data = data.strip(u'B')
                                if data == '':
                                    atts['amount'] = 0
                                else:
                                    atts['amount'] = int(float(data) * 100)
                                atts['type'] = 'Debit'
                        if (i == 3 and not isVisa) or (i == 4 and isVisa):
                            if data != "":
                                data = data.strip(u'£')
                                data = data.strip(u'C')
                                data = data.strip(u'R')
                                if data == '':
                                    atts['amount'] = 0
                                else:
                                    atts['amount'] = int(float(data) * 100)
                                atts['type'] = 'Credit'
                        if not isVisa:
                            if i == 4:
                                data = data.strip(u'£')
                                if data != "":
                                    lastbal = int(float(data) * 100)
                                    if isFirst:
                                        isFirst = False
                                        firstbal = lastbal
                                        doBalance = True
                            if i == 5:
                                if doBalance:
                                    doBalance = False
                                    if data == "D":
                                        firstbal = 0 - firstbal
                                    self.statementbuilder.set_current_balance(firstbal)
                    # commit this row's transaction
                    self.statementbuilder.make_xact(atts)
        self.statementbuilder.put_statement()
        self.current_statement = self.current_statement + 1
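# DateParser is used by every scraper in this collection but defined
# elsewhere. A minimal sketch of the three methods relied on here,
# assuming date_from_dmy takes '18/07/2011'-style strings, date_from_small
# takes compact '14APR11'-style strings, and ymd_from_date emits
# 'YYYY-MM-DD' (all assumptions about the real class):
from datetime import datetime

class DateParser(object):
    def date_from_dmy(self, text, sep='/'):
        # '18/07/2011' -> datetime(2011, 7, 18)
        return datetime.strptime(text, sep.join(['%d', '%m', '%Y']))

    def date_from_small(self, text):
        # '14APR11' -> datetime(2011, 4, 14); strptime's %b is case-insensitive
        return datetime.strptime(text, '%d%b%y')

    def ymd_from_date(self, date):
        return date.strftime('%Y-%m-%d')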
def _processCCAccount(self, raw, account_path, balance):
    soup = BeautifulSoup(raw)
    logging.debug('CC ac path - ' + str(account_path) + ' - end')
    try:
        if account_path != "":
            # delete existing current xactions
            logging.debug('Processing :) ')
            builder = StatementBuilder(self.facade, account_path, self.token)
            self.statementlist.append(builder)
            self.statementbuilder = self.statementlist[self.current_statement]
            # we know this is a credit card
            isCCard = True
            # get a fixed balance somewhere??
            # passed in for RBS
            # set up our statement
            self.statementbuilder.make_recent_dif_statement('RBS-recent', 'Scraper', None)
            # now set the final balance
            logging.debug("Balance - - - - - - - > " + str(balance))
            self.statementbuilder.set_current_balance(balance)
            # now find all the recent transactions
            x_table = soup.find('table', attrs={'class': 'ItemTable'})
            if x_table != None:
                x_body = x_table.find('tbody')
                inputs = x_body.findAll('tr')
                # build the post values up
                for rows in inputs:
                    atts = {}
                    vals = rows.findAll('td')
                    if vals:
                        datebit = ''
                        for i, val in enumerate(vals):
                            data = self.tidy_text(val.text)
                            if i == 0:
                                logging.debug("date ======> " + data)
                                dp = DateParser()
                                try:
                                    atts['date'] = dp.ymd_from_date(dp.date_from_small(data))
                                except:
                                    atts['date'] = ''
                            if i == 1:
                                datebit = data[:-5]
                            if i == 2:
                                if data != 'SALE':
                                    # only keep extra xact date for Sales
                                    datebit = ''
                            if i == 3:
                                if data != "":
                                    atts['display'] = " ".join(data.split())
                                    atts['extradisplay'] = datebit
                            if i > 3:
                                # the numbers
                                if data != "" and data != '-':
                                    amount = self.normalise_ammount(data)
                                    if i == 4:
                                        atts['amount'] = amount
                                        atts['type'] = 'Credit'
                                    if i == 5:
                                        atts['amount'] = amount
                                        atts['type'] = 'Debit'
                            if i == 5:
                                # last column - commit this row's transaction
                                self.statementbuilder.make_xact(atts)
            self.statementbuilder.put_statement()
            self.current_statement = self.current_statement + 1
    except Exception, e:
        logging.exception('RBS parsing error - ' + str(e))
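# tidy_text() is another helper not shown in this file. A plausible
# sketch, assuming it collapses runs of whitespace and strips the
# non-breaking spaces that BeautifulSoup surfaces as u'\xa0':
def tidy_text(self, text):
    # normalise whitespace so column comparisons like data == 'SALE' are stable
    return " ".join(text.replace(u'\xa0', ' ').split())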
def _processNormAccount(self, raw, account_path, balance):
    soup = BeautifulSoup(raw)
    logging.debug('Norm ac path - ' + str(account_path) + ' - end')
    try:
        if account_path != "":
            # delete existing current xactions
            logging.debug('Processing :) norm ')
            builder = StatementBuilder(self.facade, account_path, self.token)
            self.statementlist.append(builder)
            self.statementbuilder = self.statementlist[self.current_statement]
            # we know this is not a credit card
            isCCard = False
            # get a fixed balance somewhere??
            # balance passed in for natwest
            # set up our statement
            self.statementbuilder.make_recent_dif_statement('NatWest-recent', 'Scraper', None)
            # now set the final balance
            logging.debug("Balance - - - - - - - > " + str(balance))
            self.statementbuilder.set_current_balance(balance)
            # now find all the recent transactions
            x_table = soup.find('table', attrs={'class': 'ItemTable'})
            if x_table == None:
                # could easily be no transactions
                logging.debug(" No xtable ======>")
            if x_table != None:
                x_body = x_table.find('tbody')
                inputs = x_body.findAll('tr')
                # build the post values up
                for rows in inputs:
                    atts = {}
                    vals = rows.findAll('td')
                    if vals:
                        cash = ''
                        for i, val in enumerate(vals):
                            data = self.tidy_text(val.text)
                            if i == 0:
                                dp = DateParser()
                                try:
                                    atts['date'] = dp.ymd_from_date(dp.date_from_small(data))
                                except:
                                    atts['date'] = ''
                            if i == 1:
                                if data == 'ATM':
                                    cash = 'CASH - '
                            if i == 2:
                                if data != "":
                                    extra = ""
                                    datebit = ""
                                    parts = data.split(',')
                                    if len(parts) > 1:
                                        # match natwest dates - a.la. 8062 14APR11
                                        if re.match('\d{4}\s\d\d[A-Z]{3}\d\d', parts[0]) != None:
                                            datebit = parts[0][0:4] + ' ' + parts[0][5:7] + ' ' + parts[0][7:10]
                                        # remember pretty_display strips out any words containing a sequence of 3 or more numbers
                                        parts = parts[1:]
                                        if len(parts) > 1:
                                            extra = parts[-1]
                                            parts = parts[0:-1]
                                        data = ' '.join(parts)
                                    disp = (cash + data).strip()
                                    atts['display'] = " ".join(disp.split())
                                    atts['extradisplay'] = " ".join((extra + " " + datebit).split())
                            if i > 2:
                                # the numbers
                                if data != "" and data != '-':
                                    amount = self.normalise_ammount(data)
                                    if i == 3:
                                        atts['amount'] = amount
                                        atts['type'] = 'Credit'
                                    if i == 4:
                                        atts['amount'] = amount
                                        atts['type'] = 'Debit'
                            if i == 5:
                                # last column - commit this row's transaction
                                self.statementbuilder.make_xact(atts)
            self.statementbuilder.put_statement()
            self.current_statement = self.current_statement + 1
    except Exception, e:
        logging.exception('NatWest parsing error - ' + str(e))
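# A standalone illustration of the '8062 14APR11' pattern matched above:
# a 4-digit card fragment followed by a compact date, re-spaced for display.
import re

part = '8062 14APR11'
if re.match('\d{4}\s\d\d[A-Z]{3}\d\d', part) != None:
    datebit = part[0:4] + ' ' + part[5:7] + ' ' + part[7:10]
    print datebit  # -> '8062 14 APR'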
###################################################
# RMIT University, Melbourne
# Date 27 Mar 2012
# By Emil Broegger Kjer
# For questions or comments contact [email protected]
###################################################

from BeautifulSoup.BeautifulSoup import BeautifulSoup
import urllib2, re

#### Read from URL
page = urllib2.urlopen(
    "http://tt.metlinkmelbourne.com.au/tt/XSLT_TTB_REQUEST?command=direct&language=en&outputFormat=0&net=vic&line=02EPP&project=ttb&itdLPxx_selLineDir=R&sup=B"
)
soup = BeautifulSoup(page)

#### Read from file
# transport_line = "epping_line"
# weekday = "weekday"
# direction = "true"
# filestr = ('data/timetables/%s_%s_%s.html' % (transport_line, weekday, direction))
# fil = open(filestr, "r")
# soup = BeautifulSoup(fil.read(), fromEncoding='utf8')
# fil.close()

divs = soup.html.body.findAll('div')
children = divs[0].contents

#### Set the timetable
tt = children[1].contents[3].contents[3].contents[3].contents[1].contents[2]
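# The hard-coded .contents chain above breaks as soon as the page layout
# shifts. A sketch of a more tolerant lookup - the heuristic of "first
# table with header cells" is an assumption about the markup, not taken
# from the actual page:
def find_timetable(soup):
    # prefer a direct search over positional indexing into .contents
    for table in soup.findAll('table'):
        if table.find('th'):
            return table
    return None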