def _parseComment(self, communityId, liveInfoFilePath, commentFilePath): chatList = [] if not (os.path.exists(liveInfoFilePath) and os.path.exists(commentFilePath)): return chatList infoParser = BeautifulSoup(open(liveInfoFilePath, u'r')) if not infoParser.find(u'communityid').renderContents() == communityId: return chatList commentParser = BeautifulSoup(open(commentFilePath, u'r')) chatTagList = commentParser.findAll(u'chat', attrs={u'msgkind': u'message_msg'}) for chatTag in chatTagList: communityId = communityId.decode(u'utf-8') liveId = infoParser.find(u'liveid').renderContents().decode() userId = chatTag.get(u'user').decode(u'utf-8') name = chatTag.get(u'nickname').decode(u'utf-8') message = chatTag.renderContents().decode(u'utf-8') option = chatTag.get(u'mail').decode(u'utf-8') if chatTag.get(u'mail') != '' else None date = re.sub( ur'(\d{4})/(\d{1,2})/(\d{1,2})\s(\d{1,2}):(\d{1,2}):(\d{1,2})', lambda match: u'{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}'.format(int(match.group(1)), int(match.group(2)), int(match.group(3)), int(match.group(4)), int(match.group(5)), int(match.group(6))), chatTag.get(u'date') ).decode(u'utf-8') chatList.append((communityId, liveId, userId, name, message, option, date)) return chatList
def __init__( self ):
    """List the seasons ("Full Episodes" nav entries) of an NBC show.

    Reads 'showUrl' from the plugin arguments, scrapes the show page
    and adds one XBMC folder item per season.
    """
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['showUrl']))
    print 'Loading ' + self.BASE_URL + unquote_plus(params['showUrl'])
    seasonIndex = BeautifulSoup(source)
    # The title container id/class has changed across site redesigns:
    # try the known ids first, then the known classes.
    tvshowcontainer = seasonIndex.find('div',id=re.compile('scet_header|scet_top|show-header|show-header-scet|^header$'))
    if tvshowcontainer==None:
        tvshowcontainer=seasonIndex.find('div',{'class':re.compile('scet_header|scet_top|show-header|show-header-scet')})
    if tvshowcontainer!=None:
        tvshowtitle = tvshowcontainer.find('h1').string
    else:
        # Last resort: the title also appears in an inline script variable.
        tvshowtitle = re.search('var siteName = "(.+?)";',source).group(1)
    print 'Parsing seasons for "%s"' % tvshowtitle
    # Season links live in the gallery nav, in the <ul> following the
    # "Full Episodes" heading.
    showsListing = seasonIndex.find('div',{"class":re.compile('scet-gallery-nav')}).find('h3',text='Full Episodes').parent.findNextSibling('ul').findAll('li')
    for show in showsListing:
        showLink = show.find('a')
        print 'Found '+showLink.string
        listitem=xbmcgui.ListItem(decode_htmlentities(showLink.string))
        listitem.setInfo('video',{'tvshowtitle':tvshowtitle})
        #listitem.setThumbnailImage(showLink.find('img')['src'])
        # Normalise the href to a site-relative URL for 'seasonUrl'.
        if showLink['href'][0] == '/':
            showUrl = showLink['href'][1:]
        else:
            showUrl = showLink['href'].replace(self.BASE_URL,'')
        xbmcplugin.addDirectoryItem(handle=int( sys.argv[ 1 ] ),listitem=listitem,url="%s?seasonUrl=%s" % ( sys.argv[ 0 ], quote_plus(showUrl),),totalItems=len(showsListing),isFolder=True)
    xbmcplugin.setContent( handle=int(sys.argv[1]), content='seasons')
    xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ), succeeded=1 )
def __init__(self):
    """Build the episode listing for an NBC show season.

    Reads 'seasonUrl' from the plugin arguments, walks every pager page
    of the season, and adds one directory item per episode. Season and
    episode numbers are parsed from "EpSNN"-style titles when present.
    """
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['seasonUrl']))
    showIndex = BeautifulSoup(source)
    # The title container id/class varies across site redesigns.
    tvshowcontainer = showIndex.find('div', id=re.compile('scet_header|scet_top|show-header|show-header-scet|^header$'))
    if tvshowcontainer == None:
        tvshowcontainer = showIndex.find('div', {'class': re.compile('scet_header|scet_top|show-header|show-header-scet')})
    if tvshowcontainer != None:
        tvshowtitle = tvshowcontainer.find('h1').string
    else:
        # Fallback: the title also appears in an inline script variable.
        tvshowtitle = re.search('var siteName = "(.+?)";', source).group(1)
    # Count pager pages so every page of the season gets scraped.
    pages = 1
    if showIndex.find('div', {'class': re.compile('nbcu_pager')}):
        pageLinks = showIndex.find('div', {'class': re.compile('nbcu_pager')}).findAll('a', {'class': re.compile('nbcu_pager_page')})
        pages = len(pageLinks)
    for i in range(0, pages):
        if i > 0:
            source = self._fetch_url(self.BASE_URL + pageLinks[i]['href'])
            showIndex = BeautifulSoup(source)
        episodesListing = showIndex.find('ul', {'class': re.compile('scet_th_full')}).findAll('li')
        for episode in episodesListing:
            vidInfo = {'tvshowtitle': tvshowtitle, 'studio': 'NBC'}
            title = decode_htmlentities(episode.find('p', {'class': re.compile('list_full_det_title')}).find('a').string)
            listitem = xbmcgui.ListItem(title)
            listitem.setThumbnailImage(episode.find('img')['src'])
            episodeLink = episode.find('a')
            # Normalise the href to a site-relative URL.
            if episodeLink['href'][0] == '/':
                episodeUrl = episodeLink['href'][1:]
            else:
                episodeUrl = episodeLink['href'].replace(self.BASE_URL, '')
            if episode.find('p', {'class': re.compile('list_full_des')}):
                vidInfo['plot'] = decode_htmlentities(episode.find('p', {'class': re.compile('list_full_des')}).find('em').string)
            # Titles like "Ep512: Name" encode season 5, episode 12.
            epNum = re.search('^Ep(?:\.\s*)?([0-9]{1,2})([0-9][0-9])(?:\s*:\s*)?(.+)$', title)
            if epNum != None:
                vidInfo['season'] = int(epNum.group(1))
                vidInfo['episode'] = int(epNum.group(2))
                vidInfo['title'] = epNum.group(3)
            listitem.setInfo("video", vidInfo)
            # Use .get() with a 0 default so a title without an "EpSNN"
            # prefix no longer raises KeyError when building the URL.
            xbmcplugin.addDirectoryItem(
                handle=int(sys.argv[1]), listitem=listitem,
                url="%s?episodeUrl=%s&episode=%s&season=%s" % (sys.argv[0], quote_plus(episodeUrl), vidInfo.get('episode', 0), vidInfo.get('season', 0)))
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]), sortMethod=xbmcplugin.SORT_METHOD_EPISODE)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]), sortMethod=xbmcplugin.SORT_METHOD_DATE)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]), sortMethod=xbmcplugin.SORT_METHOD_LABEL)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]), sortMethod=xbmcplugin.SORT_METHOD_DURATION)
    xbmcplugin.setContent(handle=int(sys.argv[1]), content='episodes')
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def doStep3(self, allofit, page): scrape_result = "good" #-------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) # write out the start page self.output_page("natwest-security.html", page) scrape_result = 'good' logging.info("NatWest security page2") # check if we got returned # check for the password input ctl00$mainContent$LI5TABA$DBID_edit then we didnt move on errorDiv = soup.findAll( 'input', attrs={'name': 'ctl00$mainContent$LI5TABA$DBID_edit'}) if len(errorDiv) != 0: logging.info("NatWest security page1 still - customer number bad") return 'credentials incorrect' # if we get here then the form was found hence creds must be wrong # find our form loginform = soup.find('form', attrs={'name': 'aspnetForm'}) if loginform == None: logging.debug('NatWest no security form') return 'bank error' values = self.parseForm(loginform) # define some variables that would only otherwise exist in a try catch block scope # the label text split on spaces which1arr = "" which2arr = "" which3arr = "" # the chalenges firstDigit = "" secondDigit = "" thirdDigit = "" #>>>>>>> The first set of Pin fields #-------------------- get the questions --------------# #<label for="ctl00_mainContent_Tab1_LI6PPEA_edit" id="ctl00_mainContent_Tab1_LI6DDALALabel" class="wizCrl wizardLabelRememberMeWide">Enter the 2nd number</label> useNewTab = False try: which1 = soup.find('label', attrs={ 'for': 'ctl00_mainContent_LI6PPEA_edit' }).text except Exception, e: useNewTab = True
def doStep3(self, allofit, page): scrape_result = "good" #-------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) # write out the start page self.output_page("RBS-security.html", page) scrape_result = 'good' logging.info("RBS security page2") # check if we got returned # check for the password input ctl00$mainContent$LI5TABA$DBID_edit then we didnt move on errorDiv=soup.findAll('input', attrs={'name' : 'ctl00$mainContent$LI5TABA$DBID_edit'}) if len(errorDiv) != 0: logging.info("RBS security page1 still - customer number bad") return 'credentials incorrect' # if we get here then the form was found hence creds must be wrong # find our form loginform=soup.find('form', attrs={'name': 'aspnetForm'}) if loginform == None: logging.debug('RBS no security form') return 'bank error' values = self.parseForm(loginform) # define some variables that would only otherwise exist in a try catch block scope # the label text split on spaces which1arr = "" which2arr = "" which3arr = "" # the chalenges firstDigit = "" secondDigit = "" thirdDigit = "" #>>>>>>> The first set of Pin fields #-------------------- get the questions --------------# #<label for="ctl00_mainContent_Tab1_LI6PPEA_edit" id="ctl00_mainContent_Tab1_LI6DDALALabel" class="wizCrl wizardLabelRememberMeWide">Enter the 2nd number</label> useNewTab = False try: which1=soup.find('label', attrs={'for' : 'ctl00_mainContent_LI6PPEA_edit'}).text except Exception, e: useNewTab = True
def doStep7(self, page):
    """Request one month of transactions from the NatWest account page.

    Parses the aspnetForm on the account-link page, selects the
    one-month period and the "View Transactions" button, and stores the
    resulting POST request in self.response as step 20. Returns 'good',
    or 'bank error' when the form is missing.
    """
    soup = BeautifulSoup(page)
    self.output_page("natwest-acclink.html", page)
    form = soup.find('form', attrs={'name': 'aspnetForm'})
    if form == None:
        logging.debug('NatWest no view account form')
        return 'bank error'
    target = self.urlBase + '/' + form['action']
    fields = self.parseForm(form)
    # One-month statement period plus the default submit button.
    fields['ctl00$mainContent$SS2SPDDA'] = 'M1'
    fields['ctl00$mainContent$NextButton_button'] = 'View Transactions'
    body = urllib.urlencode(fields)
    self.response = {
        'url': self.ByteToHex(target),
        'data': self.ByteToHex(body),
        'method': 'POST',
        'step': 20,
    }
    return 'good'
def doStep1(self, allofit, page):
    """Follow the security frame from the NatWest landing page.

    Locates the ctl00_secframe frame and queues a GET for its src as
    step 2 in self.response. Returns 'good', or 'bank error' when the
    frame is absent.
    """
    logging.info("NatWest page1")
    # write out the start page (headers could be read from
    # allofit['headers'] via HexToByte if ever needed)
    self.output_page("1_first_page.html", page)
    soup = BeautifulSoup(page)
    secframe = soup.find('frame', id='ctl00_secframe')
    if secframe == None:
        logging.debug('NatWest frame link error - ')
        return 'bank error'
    # e.g. <frame id="ctl00_secframe" src="login.aspx?refererident=...">
    target = self.urlBase + '/' + secframe['src']
    self.response = {
        'url': self.ByteToHex(target),
        'data': "",
        'method': 'GET',
        'step': 2,
    }
    return 'good'
def doStep2(self, allofit, page):
    """Submit the RBS customer number on the login form.

    Fills ctl00$mainContent$LI5TABA$DBID_edit with stored credential
    '01' and queues the POST as step 3 in self.response. Returns 'good'
    or 'bank error' when the aspnetForm is missing.
    """
    soup = BeautifulSoup(page)
    self.output_page("RBS-username.html", page)
    form = soup.find('form', attrs={'name': 'aspnetForm'})
    if form == None:
        logging.debug('RBS no login form')
        return 'bank error'
    target = self.urlBase + '/' + form['action']
    fields = self.parseForm(form)
    # customer number
    fields["ctl00$mainContent$LI5TABA$DBID_edit"] = self.filledCreds['01']
    self.response = {
        'url': self.ByteToHex(target),
        'data': self.ByteToHex(urllib.urlencode(fields)),
        'method': 'POST',
        'step': 3,
    }
    return 'good'
def __init__( self ): params = self._parse_argv() source = self._fetch_url(self.BASE_URL + unquote_plus(params['showUrl'])) showIndex = BeautifulSoup(source) vidInfo = {'tvshowtitle': showIndex.find('div',id='showDashboard').find('span',{'class':'blueText'}).string, 'studio': 'FOX'} seasonsListing = showIndex.findAll('div',{'class':re.compile('dashPageHolder'),'id':re.compile('^fullEp')}) print len(seasonsListing) for season in seasonsListing: episodesListing = season.findAll('div',{'class':'episodeListing'}) for episode in episodesListing: listitem = xbmcgui.ListItem(episode.find('h3').find('a').string) listitem.setThumbnailImage(episode.find('img',id=re.compile('^epThumb'))['src']) episodeLink = episode.find('a',{'class':'thumbnailLink'}) if episodeLink['href'][0] == '/': episodeUrl = episodeLink['href'][1:] else: episodeUrl = episodeLink['href'].replace(self.BASE_URL,'') airedDateAndPlot = re.search('Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$',str(episode.find('div',{'class':'episodeInfo'}))) seasonNum = re.search('Season\s+([0-9]+?)[\s:]',str(episode.find('p',{'class':'seasonNum'}))) episodeNumAndDuration = re.search('Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)',str(episode.find('p',{'class':'episodeNumLine'}))) vidInfo['aired'] = '%s-%s-%s' % (airedDateAndPlot.group(3),airedDateAndPlot.group(1),airedDateAndPlot.group(2)) vidInfo['season'] = int(seasonNum.group(1)) vidInfo['episode'] = int(episodeNumAndDuration.group(1)) vidInfo['duration'] = episodeNumAndDuration.group(2) vidInfo['title'] = episode.find('h3').find('a').string vidInfo['plot'] = decode_htmlentities(airedDateAndPlot.group(4)) print vidInfo listitem.setInfo("video",vidInfo) xbmcplugin.addDirectoryItem(handle=int( sys.argv[ 1 ] ),listitem=listitem,url="%s?episodeUrl=%s" % ( sys.argv[ 0 ], quote_plus(episodeUrl))) xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ), succeeded=1 )
def doStep6(self, page):
    """Request one month of transactions from the RBS account page.

    Fills the aspnetForm with a one-month period selection plus the
    "View Transactions" button and queues the POST as step 20. Returns
    'good', or 'bank error' when the form is missing.
    """
    #-------------------------------- Grab the form values -----------------------------------------------
    soup = BeautifulSoup(page)
    self.output_page("RBS-acclink.html", page)
    loginform=soup.find('form', attrs={'name': 'aspnetForm'})
    if loginform == None:
        logging.debug('RBS no view account form')
        return 'bank error'
    action = self.urlBase + '/' + loginform['action']
    values = self.parseForm(loginform)
    # fill in our selection - 1 month
    values['ctl00$mainContent$SS2SPDDA'] = 'M1'
    # default button - needed
    values['ctl00$mainContent$NextButton_button'] = 'View Transactions'
    # build the body content
    data = urllib.urlencode(values)
    self.response = {}
    self.response['url'] = self.ByteToHex(action)
    self.response['data'] = self.ByteToHex(data)
    self.response['method'] = 'POST'
    self.response['step'] = 20
    return 'good'
def doStep2(self, allofit, page):
    """Submit the NatWest customer number on the login form.

    Fills ctl00$mainContent$LI5TABA$DBID_edit with stored credential
    '01' and queues the POST as step 3. Returns 'good' or 'bank error'.
    """
    # -------------------------------- Grab the form values -----------------------------------------------
    soup = BeautifulSoup(page)
    self.output_page("natwest-username.html", page)
    loginform = soup.find("form", attrs={"name": "aspnetForm"})
    if loginform == None:
        logging.debug("NatWest no login form")
        return "bank error"
    action = self.urlBase + "/" + loginform["action"]
    values = self.parseForm(loginform)
    # fill in our credentials
    values["ctl00$mainContent$LI5TABA$DBID_edit"] = self.filledCreds["01"]  # customer number
    # build the body content
    data = urllib.urlencode(values)
    self.response = {}
    self.response["url"] = self.ByteToHex(action)
    self.response["data"] = self.ByteToHex(data)
    self.response["method"] = "POST"
    self.response["step"] = 3
    return "good"
def doStep2(self, allofit, page):
    """Post the stored NatWest customer number into the login form.

    Returns 'good' after queueing the step-3 POST in self.response, or
    'bank error' when the aspnetForm cannot be found.
    """
    soup = BeautifulSoup(page)
    self.output_page("natwest-username.html", page)
    form = soup.find('form', attrs={'name': 'aspnetForm'})
    if form == None:
        logging.debug('NatWest no login form')
        return 'bank error'
    target = self.urlBase + '/' + form['action']
    fields = self.parseForm(form)
    # customer number
    fields["ctl00$mainContent$LI5TABA$DBID_edit"] = self.filledCreds['01']
    self.response = {
        'url': self.ByteToHex(target),
        'data': self.ByteToHex(urllib.urlencode(fields)),
        'method': 'POST',
        'step': 3,
    }
    return 'good'
def get(self):
    """Fetch Cleartrip flight results and render them as an HTML table.

    Reads date/from/to query parameters, scrapes the embedded
    <script id="json"> block from the results page and writes price and
    per-leg details for each itinerary statement found.

    NOTE(review): `content = eval(...)` below executes text taken from
    a remote page — a code-injection risk; replace with a JSON parser.
    """
    dates = self.request.get('date')
    froms = self.request.get("from")
    to = self.request.get("to")
    #froms= "BOM"
    #to = "CBJ"
    resp=urlfetch.fetch("http://www.cleartrip.com/flights/results?from="+froms+"&to="+to+"&depart_date="+dates+"&adults=1&childs=0&infants=0&dep_time=0&class=Economy&airline=&carrier=&x=57&y=16&flexi_search=no&tb=n")
    soup = BS(resp.content)
    # The results page embeds its data in a <script id="json"> element;
    # its text is a sequence of ';'-separated JS assignments.
    my_content = soup.find("script",{"id":"json"})
    string_resp = str(my_content).strip()
    #self.response.out.write(str(string_resp))
    resp_splitted = string_resp.split(';')
    #self.response.out.write(str(resp_splitted))
    self.response.headers['Content-Type'] = 'text/html'
    self.response.out.write('<html><body><table>')
    # Negative start index for the slice of statements of interest,
    # counted from the end of the split script text.
    a = 2-len(resp_splitted)
    #self.response.out.write(string_resp)
    #query2 = resp_splitted[-10].split('=')
    #self.response.out.write(query2[1])
    """content = eval(query2[1])
    self.response.out.write('<tr><td>Price</td>')
    self.response.out.write('<td>'+content['pr']+'</td></tr>')
    legs = content['legs']
    i = 0
    for leg in legs:
        self.response.out.write('<tr><td>Way '+str(i)+':</td></tr>')
        self.response.out.write('<td>'+leg['fr'] + "to "+ leg['to'] +'</td>')
        self.response.out.write('<tr><td>Arrival '+str(i)+':</td></tr>')
        self.response.out.write ('<td>'+leg['a']+'</td>')
        self.response.out.write('<tr><td>Departure '+str(i)+':</td></tr>')
        self.response.out.write ('<td>'+leg['dp']+'</td>')
        i+=1"""
    for query in range(a,-9):
        # Each statement looks like "name = {...}"; take the RHS.
        query2 = resp_splitted[query].strip().split('=')
        try:
            # NOTE(review): eval of remote page content — see docstring.
            content = eval(query2[1])
            self.response.out.write("<tr><td>******************</td></tr>")
            self.response.out.write('<tr><td>Price</td>')
            self.response.out.write('<td>'+str(content.get('pr'))+'</td></tr>')
            legs = content.get('legs')
            i = 0
            for leg in legs:
                i+=1
                self.response.out.write('<tr><td>Way '+str(i)+':</td>')
                self.response.out.write('<td>'+leg.get('fr') + " => "+ leg['to'] +'</td></tr>')
                self.response.out.write('<tr><td>Arrival '+str(i)+':</td>')
                self.response.out.write ('<td>'+str(leg.get('a'))+'</td></tr>')
                self.response.out.write('<tr><td>Departure '+str(i)+':</td>')
                self.response.out.write ('<td>'+str(leg.get('dp'))+'</td></tr>')
        except:
            # NOTE(review): bare except silently skips statements that
            # fail to eval or lack the expected keys.
            pass
def DoStep3(self, allofit):
    """Check the first direct summary page for the balances table.

    Stores the per-account action links on self.accountLinks. Returns
    'good' when links were found, 'bank error' when the table exists
    but has no links, and 'credentials incorrect' when the table is
    missing entirely.
    """
    page = self.HexToByte(allofit['body'])
    self.output_page("fd-summary.html", page)
    soup = BeautifulSoup(page)
    balances = soup.find('table', attrs={'class': 'fdBalancesTable'})
    if balances == None:
        logging.debug("No fd table")
        return 'credentials incorrect'
    self.accountLinks = balances.findAll('a', attrs={'class': 'fdActionLink'})
    if len(self.accountLinks) == 0:
        # got some kind of message page instead of account links
        logging.info('Still got no accounts')
        return 'bank error'
    return 'good'
def firstPass(self, page):
    """Derive the base URL from the login form and queue the username POST.

    Parses the first <form>, records its scheme+host on self.urlBase,
    and stores a POST of credential '03' (userid) as step 2 in
    self.response.
    """
    soup = BeautifulSoup(page)
    loginform = soup.find('form')
    action = loginform['action']
    # Remember the origin so later relative form actions can be resolved.
    self.urlBase = "https://" + urlparse(action).netloc
    logging.info("Base URL = " + self.urlBase)
    # (The form's <input> elements carry nothing we need; only the
    # userid is posted.)
    values = {'userid': self.filledCreds['03']}  # username
    # build the body content
    data = urllib.urlencode(values)
    self.response = {}
    self.response['url'] = self.ByteToHex(action)
    self.response['data'] = self.ByteToHex(data)
    self.response['method'] = 'POST'
    self.response['step'] = 2
def firstPass(self, page):
    """Open the login sequence: capture the form target and post the userid.

    Saves the form's origin on self.urlBase and queues step 2 as a POST
    of credential '03' (the username) to the form action.
    """
    form = BeautifulSoup(page).find('form')
    action = form['action']
    parts = urlparse(action)
    self.urlBase = "https://" + parts.netloc
    logging.info("Base URL = " + self.urlBase)
    inputs = form.findAllNext('input')
    payload = urllib.urlencode({'userid': self.filledCreds['03']})  # username
    self.response = {
        'url': self.ByteToHex(action),
        'data': self.ByteToHex(payload),
        'method': 'POST',
        'step': 2,
    }
def DoStep3(self, allofit):
    """Check the first direct summary page for the balances table.

    Saves the per-account action links on self.accountLinks. Returns
    'good', 'bank error' (table present but empty), or 'credentials
    incorrect' (table missing).
    """
    scrape_result = "good"
    page = self.HexToByte(allofit['body'])
    #-------------------------------- Grab the form values -----------------------------------------------
    soup = BeautifulSoup(page)
    self.output_page("fd-summary.html", page)
    accountTable = soup.find('table', attrs={'class': 'fdBalancesTable'})
    if accountTable != None:
        self.accountLinks = accountTable.findAll(
            'a', attrs={'class': 'fdActionLink'})
        if len(self.accountLinks) == 0:
            #got some kind of message
            scrape_result = 'bank error'
            logging.info('Still got no accounts')
    else:
        logging.debug("No fd table")
        scrape_result = 'credentials incorrect'
    return scrape_result
def doStep1(self, allofit, page):
    """Follow the security frame from the NatWest landing page.

    Locates frame ctl00_secframe and queues a GET for its src as
    step 2. Returns 'good', or 'bank error' when the frame is missing.
    """
    body = page
    scrape_result = "good"
    logging.info("NatWest page1")
    # the following is how you could retrieve the headers from the request
    # for head in allofit['headers']:
    #     name = self.HexToByte(head['name'])
    #     val = self.HexToByte(head['value'])
    # write out the start page
    self.output_page("1_first_page.html", body)
    soup = BeautifulSoup(body)
    frame = soup.find("frame", id="ctl00_secframe")
    if frame != None:
        action = self.urlBase + "/" + frame["src"]
        # <frame id="ctl00_secframe" title="Security Frame" frameborder="0" src="login.aspx?refererident=774C53DCE4C17556595C91973A6DF1A0A1F6242E&cookieid=100714"></frame>
        self.response = {}
        self.response["url"] = self.ByteToHex(action)
        self.response["data"] = ""
        self.response["method"] = "GET"
        self.response["step"] = 2
    else:
        logging.debug("NatWest frame link error - ")
        scrape_result = "bank error"
    return scrape_result
def doStep7(self, page):
    """Request one month of NatWest transactions.

    Selects the one-month period plus the "View Transactions" button in
    the aspnetForm and queues the POST as step 20. Returns 'good', or
    'bank error' when the form is missing.
    """
    # -------------------------------- Grab the form values -----------------------------------------------
    soup = BeautifulSoup(page)
    self.output_page("natwest-acclink.html", page)
    loginform = soup.find("form", attrs={"name": "aspnetForm"})
    if loginform == None:
        logging.debug("NatWest no view account form")
        return "bank error"
    action = self.urlBase + "/" + loginform["action"]
    values = self.parseForm(loginform)
    # fill in our selection - 1 month
    values["ctl00$mainContent$SS2SPDDA"] = "M1"
    # default button - needed
    values["ctl00$mainContent$NextButton_button"] = "View Transactions"
    # build the body content
    data = urllib.urlencode(values)
    self.response = {}
    self.response["url"] = self.ByteToHex(action)
    self.response["data"] = self.ByteToHex(data)
    self.response["method"] = "POST"
    self.response["step"] = 20
    return "good"
def __init__(self):
    """List the seasons ("Full Episodes" nav entries) of an NBC show.

    Reads 'showUrl' from the plugin arguments, scrapes the show page
    and adds one XBMC folder item per season.
    """
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['showUrl']))
    print 'Loading ' + self.BASE_URL + unquote_plus(params['showUrl'])
    seasonIndex = BeautifulSoup(source)
    # The title container id/class has changed across site redesigns:
    # try the known ids first, then the known classes.
    tvshowcontainer = seasonIndex.find(
        'div',
        id=re.compile(
            'scet_header|scet_top|show-header|show-header-scet|^header$'))
    if tvshowcontainer == None:
        tvshowcontainer = seasonIndex.find(
            'div', {
                'class':
                re.compile(
                    'scet_header|scet_top|show-header|show-header-scet')
            })
    if tvshowcontainer != None:
        tvshowtitle = tvshowcontainer.find('h1').string
    else:
        # Last resort: the title also appears in an inline script variable.
        tvshowtitle = re.search('var siteName = "(.+?)";', source).group(1)
    print 'Parsing seasons for "%s"' % tvshowtitle
    # Season links live in the gallery nav, in the <ul> following the
    # "Full Episodes" heading.
    showsListing = seasonIndex.find('div', {
        "class": re.compile('scet-gallery-nav')
    }).find(
        'h3',
        text='Full Episodes').parent.findNextSibling('ul').findAll('li')
    for show in showsListing:
        showLink = show.find('a')
        print 'Found ' + showLink.string
        listitem = xbmcgui.ListItem(decode_htmlentities(showLink.string))
        listitem.setInfo('video', {'tvshowtitle': tvshowtitle})
        #listitem.setThumbnailImage(showLink.find('img')['src'])
        # Normalise the href to a site-relative URL for 'seasonUrl'.
        if showLink['href'][0] == '/':
            showUrl = showLink['href'][1:]
        else:
            showUrl = showLink['href'].replace(self.BASE_URL, '')
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]),
                                    listitem=listitem,
                                    url="%s?seasonUrl=%s" % (
                                        sys.argv[0],
                                        quote_plus(showUrl),
                                    ),
                                    totalItems=len(showsListing),
                                    isFolder=True)
    xbmcplugin.setContent(handle=int(sys.argv[1]), content='seasons')
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def __init__(self):
    """Build the XBMC episode listing for a FOX show page.

    NOTE(review): each re.search below may return None on a redesigned
    page, in which case .group() raises; vidInfo is also shared across
    loop iterations rather than created per episode — confirm intended.
    """
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['showUrl']))
    showIndex = BeautifulSoup(source)
    vidInfo = {
        'tvshowtitle':
        showIndex.find('div', id='showDashboard').find('span', {
            'class': 'blueText'
        }).string,
        'studio': 'FOX'
    }
    seasonsListing = showIndex.findAll(
        'div', {
            'class': re.compile('dashPageHolder'),
            'id': re.compile('^fullEp')
        })
    print len(seasonsListing)
    for season in seasonsListing:
        episodesListing = season.findAll('div', {'class': 'episodeListing'})
        for episode in episodesListing:
            listitem = xbmcgui.ListItem(
                episode.find('h3').find('a').string)
            listitem.setThumbnailImage(
                episode.find('img', id=re.compile('^epThumb'))['src'])
            episodeLink = episode.find('a', {'class': 'thumbnailLink'})
            # Normalise the href to a site-relative URL.
            if episodeLink['href'][0] == '/':
                episodeUrl = episodeLink['href'][1:]
            else:
                episodeUrl = episodeLink['href'].replace(self.BASE_URL, '')
            # Pull aired date + plot, season number and episode
            # number/duration out of the raw HTML of the info elements.
            airedDateAndPlot = re.search(
                'Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$',
                str(episode.find('div', {'class': 'episodeInfo'})))
            seasonNum = re.search(
                'Season\s+([0-9]+?)[\s:]',
                str(episode.find('p', {'class': 'seasonNum'})))
            episodeNumAndDuration = re.search(
                'Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)',
                str(episode.find('p', {'class': 'episodeNumLine'})))
            vidInfo['aired'] = '%s-%s-%s' % (airedDateAndPlot.group(3),
                                             airedDateAndPlot.group(1),
                                             airedDateAndPlot.group(2))
            vidInfo['season'] = int(seasonNum.group(1))
            vidInfo['episode'] = int(episodeNumAndDuration.group(1))
            vidInfo['duration'] = episodeNumAndDuration.group(2)
            vidInfo['title'] = episode.find('h3').find('a').string
            vidInfo['plot'] = decode_htmlentities(
                airedDateAndPlot.group(4))
            print vidInfo
            listitem.setInfo("video", vidInfo)
            xbmcplugin.addDirectoryItem(
                handle=int(sys.argv[1]),
                listitem=listitem,
                url="%s?episodeUrl=%s" %
                (sys.argv[0], quote_plus(episodeUrl)))
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def __init__(self):
    """List every show found on the full-episodes (FOD) index page."""
    listing = BeautifulSoup(self._fetch_url(self.BASE_FOD_URL)).find(
        'div', id='episodes-listing').findAll('li')
    total = len(listing)
    handle = int(sys.argv[1])
    for entry in listing:
        link = entry.find('a')
        href = link['href']
        # Normalise the href to a site-relative URL.
        if href[0] == '/':
            relative = href[1:]
        else:
            relative = href.replace(self.BASE_URL, '')
        xbmcplugin.addDirectoryItem(
            handle=handle,
            listitem=xbmcgui.ListItem(link.string),
            url="%s?showUrl=%s" % (sys.argv[0], quote_plus(relative)),
            totalItems=total,
            isFolder=True)
    xbmcplugin.endOfDirectory(handle=handle, succeeded=1)
def _loadComment(self, communityId, userSettingFilePath, commentLogFolder): nameDict = self._loadUserSetting(communityId, userSettingFilePath) commentLogFileList = filter(lambda file: re.match(ur'ncvLog_lv\d+-{0}\.xml$'.format(communityId), file) , os.listdir(commentLogFolder)) chatList = [] for commentFile in commentLogFileList: parser = BeautifulSoup(open(os.path.join(commentLogFolder, commentFile), u'r')) liveId = u'lv' + parser.find(u'livenum').renderContents().decode(u'utf-8') chatTagList = parser.find(u'livecommentdataarray').findAll(u'chat', recursive=False) for chatTag in chatTagList: userId = chatTag.get(u'user_id') if chatTag.get(u'user_id') == u'': continue name = nameDict.get(userId) message = chatTag.renderContents().decode(u'utf-8') option = chatTag.get(u'mail') unixtime = time.localtime(int(chatTag.get(u'date'))) date = (datetime.datetime(*unixtime[:-3]).strftime(u'%Y-%m-%d %H:%M:%S') if unixtime else None).decode(u'utf-8') chatList.append((communityId, liveId, userId, name, message, option, date)) return chatList
def DoStep2(self, allofit):
    """Answer the first direct password-digit challenge.

    Reads the three requested digit positions from the page, looks each
    up in the stored password and posts them together with the
    memorable answer as step 3. Returns 'good', or 'credentials
    incorrect' when a digit cannot be resolved.
    """
    page = self.HexToByte(allofit['body'])
    soup = BeautifulSoup(page)
    self.output_page("fd-username.html", page)
    form = soup.find('form')
    action = form['action']
    inputs = form.findAllNext('input')
    self.response = {}
    # The challenge positions live in the second paragraph after the
    # form, one <strong> element per requested digit.
    paragraphs = form.findAllNext('p')
    digits = paragraphs[1].findAllNext('strong')
    try:
        password = (self.lookupdigit(digits[0].text)
                    + self.lookupdigit(digits[1].text)
                    + self.lookupdigit(digits[2].text))
    except:
        # not enough lookup digits
        logging.debug("credentials incorrect")
        return 'credentials incorrect'
    payload = {
        'password': password,
        'memorableAnswer': self.filledCreds['06'],
    }
    body = urllib.urlencode(payload)
    self.response['url'] = self.ByteToHex(action)
    self.response['data'] = self.ByteToHex(body)
    self.response['method'] = 'POST'
    self.response['step'] = 3
    return 'good'
def DoStep2(self, allofit): page = self.HexToByte( allofit['body']) #-------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) self.output_page("fd-username.html", page) loginform=soup.find('form') action = loginform['action'] inputs = loginform.findAllNext('input') values = {} self.response = {} # build the post values up - there arent any others afaik ps = loginform.findAllNext('p') numbers = ps[1].findAllNext('strong') #not enough lookup digits try: password = self.lookupdigit(numbers[0].text) + self.lookupdigit(numbers[1].text) + self.lookupdigit(numbers[2].text) except: logging.debug("credentials incorrect") return 'credentials incorrect' answer = self.filledCreds['06'] values['password'] = password values['memorableAnswer'] = answer # build the body content data = urllib.urlencode(values) self.response['url'] = self.ByteToHex(action) self.response['data'] = self.ByteToHex(data) self.response['method'] = 'POST' self.response['step'] = 3 return 'good'
def __init__(self):
    """List every show on the full-episodes (FOD) index page as folders."""
    source = self._fetch_url(self.BASE_FOD_URL)
    fodIndex = BeautifulSoup(source)
    showsListing = fodIndex.find('div', id='episodes-listing').findAll('li')
    for show in showsListing:
        showLink = show.find('a')
        # Normalise the href to a site-relative URL.
        if showLink['href'][0] == '/':
            showUrl = showLink['href'][1:]
        else:
            showUrl = showLink['href'].replace(self.BASE_URL, '')
        xbmcplugin.addDirectoryItem(
            handle=int(sys.argv[1]),
            listitem=xbmcgui.ListItem(showLink.string),
            url="%s?showUrl=%s" % (sys.argv[0], quote_plus(showUrl)),
            totalItems=len(showsListing),
            isFolder=True)
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def doStep1(self, allofit, page):
    """Follow the security frame from the RBS landing page.

    Finds the ctl00_secframe frame and queues a GET for its src as
    step 2 in self.response. Returns 'good', or 'bank error' when the
    frame is missing.
    """
    logging.info("RBS page1")
    # write out the start page (headers could be read from
    # allofit['headers'] via HexToByte if ever needed)
    self.output_page("1_first_page.html", page)
    frame = BeautifulSoup(page).find('frame', id='ctl00_secframe')
    if frame == None:
        logging.debug('RBS frame link error - ')
        return 'bank error'
    # e.g. <frame id="ctl00_secframe" src="login.aspx?refererident=...">
    target = self.urlBase + '/' + frame['src']
    self.response = {
        'url': self.ByteToHex(target),
        'data': "",
        'method': 'GET',
        'step': 2,
    }
    return 'good'
def __init__( self ): print 'Fetching %s' % self.INDEX_URL source = self._fetch_url(self.INDEX_URL) fodIndex = BeautifulSoup(source) showsListing = fodIndex.find('div',{"class":re.compile('group-full-eps')}).findAll('li') print 'Parsed listing and found %d shows' % len(showsListing) for show in showsListing: showLink = show.find('a') listitem=xbmcgui.ListItem(decode_htmlentities(showLink['title'])) episodeCount = show.find('div',text=re.compile('^[0-9]+ Videos?$')) if episodeCount: episodeCount = int(re.search('^([0-9]+)\s*Videos?$',episodeCount.string).group(1)) print 'Found "%s" with %d episodes' % (decode_htmlentities(showLink['title']),episodeCount) listitem.setInfo('video',{'episode':episodeCount}) else: print 'Found "%s" but did not find how many episodes' % decode_htmlentities(showLink['title']) listitem.setThumbnailImage(showLink.find('img')['src']) if showLink['href'][0] == '/': showUrl = showLink['href'][1:] else: showUrl = showLink['href'].replace(self.BASE_URL,'') xbmcplugin.addDirectoryItem(handle=int( sys.argv[ 1 ] ),listitem=listitem,url="%s?showUrl=%s" % ( sys.argv[ 0 ], quote_plus(showUrl)),totalItems=len(showsListing),isFolder=True) xbmcplugin.setContent(handle=int(sys.argv[1]), content='tvshows') xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ), succeeded=1 )
def __init__(self): print 'Fetching %s' % self.INDEX_URL source = self._fetch_url(self.INDEX_URL) fodIndex = BeautifulSoup(source) showsListing = fodIndex.find('div', { "class": re.compile('group-full-eps') }).findAll('li') print 'Parsed listing and found %d shows' % len(showsListing) for show in showsListing: showLink = show.find('a') listitem = xbmcgui.ListItem(decode_htmlentities(showLink['title'])) episodeCount = show.find('div', text=re.compile('^[0-9]+ Videos?$')) if episodeCount: episodeCount = int( re.search('^([0-9]+)\s*Videos?$', episodeCount.string).group(1)) print 'Found "%s" with %d episodes' % (decode_htmlentities( showLink['title']), episodeCount) listitem.setInfo('video', {'episode': episodeCount}) else: print 'Found "%s" but did not find how many episodes' % decode_htmlentities( showLink['title']) listitem.setThumbnailImage(showLink.find('img')['src']) if showLink['href'][0] == '/': showUrl = showLink['href'][1:] else: showUrl = showLink['href'].replace(self.BASE_URL, '') xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), listitem=listitem, url="%s?showUrl=%s" % (sys.argv[0], quote_plus(showUrl)), totalItems=len(showsListing), isFolder=True) xbmcplugin.setContent(handle=int(sys.argv[1]), content='tvshows') xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def doStep4(self, allofit, page): scrape_result = "good" #-------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) # write out the start page self.output_page("RBS-pos-accounts.html", page) scrape_result = 'good' logging.info("RBS message or bad cred check ") # if we still have the input then def bad credentials errorDiv=soup.findAll('input', attrs={'name' : 'ctl00$mainContent$LI6PPEA_edit'}) if len(errorDiv) != 0: logging.info("RBS defiantely bad credentials") return 'credentials incorrect' accountBLock=soup.findAll('table', attrs={'class' : 'AccountTable'}) # got some acount details so all good if len(accountBLock) > 0: logging.debug("RBS defiantely got some good accounts") return 'good'; # find any link # if we find a link return it # check for the normal continue button and fail all else - with credentials failure continueButton = soup.find('input', attrs={'id' : 'ctl00_mainContent_FinishButton_button'}) if(continueButton == None): logging.warning("RBS cant find finish button credentials incorrect") nextButton = soup.find('input', attrs={'id' : 'ctl00_mainContent_NextButton_button'}) if(nextButton == None): logging.warning("RBS cant find next button either") return 'credentials incorrect' # now find the form that these buttons belong to loginform=soup.find('form', attrs={'name': 'aspnetForm'}) if loginform == None: logging.debug('RBS no continue form') return 'bank error' action = self.urlBase + '/' + loginform['action'] # any hidden values etc values = self.parseForm(loginform) # build the body content data = urllib.urlencode(values) self.response = {} self.response['url'] = self.ByteToHex(action) self.response['data'] = self.ByteToHex(data) self.response['method'] = 'POST' self.response['step'] = 4 return 'messages'
def get(self):
    """GAE request handler: fetch a Cleartrip flight-search results page
    and render price/leg details scraped from its embedded script data.

    Query params: 'date', 'from', 'to' (IATA-style codes - TODO confirm).
    NOTE(review): eval() below runs on data pulled from a remote page -
    that executes untrusted input; it should be replaced with a JSON parser.
    """
    dates = self.request.get('date')
    froms = self.request.get("from")
    to = self.request.get("to")
    #froms= "BOM"
    #to = "CBJ"
    resp = urlfetch.fetch(
        "http://www.cleartrip.com/flights/results?from=" + froms + "&to=" + to +
        "&depart_date=" + dates +
        "&adults=1&childs=0&infants=0&dep_time=0&class=Economy&airline=&carrier=&x=57&y=16&flexi_search=no&tb=n"
    )
    soup = BS(resp.content)
    # the results page embeds its data in <script id="json">...</script>
    my_content = soup.find("script", {"id": "json"})
    string_resp = str(my_content).strip()
    #self.response.out.write(str(string_resp))
    resp_splitted = string_resp.split(';')
    #self.response.out.write(str(resp_splitted))
    self.response.headers['Content-Type'] = 'text/html'
    self.response.out.write('<html><body><table>')
    # negative start index so the loop below walks statements counted
    # from the end of the script (range(a, -9))
    a = 2 - len(resp_splitted)
    #self.response.out.write(string_resp)
    #query2 = resp_splitted[-10].split('=')
    #self.response.out.write(query2[1])
    """content = eval(query2[1])
    self.response.out.write('<tr><td>Price</td>')
    self.response.out.write('<td>'+content['pr']+'</td></tr>')
    legs = content['legs']
    i = 0
    for leg in legs:
        self.response.out.write('<tr><td>Way '+str(i)+':</td></tr>')
        self.response.out.write('<td>'+leg['fr'] + "to "+ leg['to'] +'</td>')
        self.response.out.write('<tr><td>Arrival '+str(i)+':</td></tr>')
        self.response.out.write ('<td>'+leg['a']+'</td>')
        self.response.out.write('<tr><td>Departure '+str(i)+':</td></tr>')
        self.response.out.write ('<td>'+leg['dp']+'</td>')
        i+=1"""
    for query in range(a, -9):
        query2 = resp_splitted[query].strip().split('=')
        try:
            # SECURITY: eval of remote page content - executes untrusted input
            content = eval(query2[1])
            self.response.out.write("<tr><td>******************</td></tr>")
            self.response.out.write('<tr><td>Price</td>')
            self.response.out.write('<td>' + str(content.get('pr')) + '</td></tr>')
            legs = content.get('legs')
            i = 0
            for leg in legs:
                i += 1
                self.response.out.write('<tr><td>Way ' + str(i) + ':</td>')
                self.response.out.write('<td>' + leg.get('fr') + " => " + leg['to'] + '</td></tr>')
                self.response.out.write('<tr><td>Arrival ' + str(i) + ':</td>')
                self.response.out.write('<td>' + str(leg.get('a')) + '</td></tr>')
                self.response.out.write('<tr><td>Departure ' + str(i) + ':</td>')
                self.response.out.write('<td>' + str(leg.get('dp')) + '</td></tr>')
        # NOTE(review): bare except silently skips fragments that fail to parse
        except:
            pass
def __init__(self):
    """Scrape an NBC show's season page(s) and add one XBMC directory
    item per episode, with season/episode parsed from the title.

    NOTE(review): when the title does not match the "EpSSEE" pattern,
    vidInfo['episode'] / vidInfo['season'] are never set but are read
    unconditionally when building the item URL below - that row would
    raise KeyError. Confirm against real page data before changing.
    """
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['seasonUrl']))
    showIndex = BeautifulSoup(source)
    # the show title lives in an <h1> inside one of several header divs
    tvshowcontainer = showIndex.find(
        'div',
        id=re.compile('scet_header|scet_top|show-header|show-header-scet|^header$'))
    if tvshowcontainer == None:
        tvshowcontainer = showIndex.find(
            'div',
            {'class': re.compile('scet_header|scet_top|show-header|show-header-scet')})
    if tvshowcontainer != None:
        tvshowtitle = tvshowcontainer.find('h1').string
    else:
        # fall back to the page's JavaScript siteName variable
        tvshowtitle = re.search('var siteName = "(.+?)";', source).group(1)
    # listings may be paginated; count pager links to know how many pages
    pages = 1
    if showIndex.find('div', {'class': re.compile('nbcu_pager')}):
        pageLinks = showIndex.find('div', {
            'class': re.compile('nbcu_pager')
        }).findAll('a', {'class': re.compile('nbcu_pager_page')})
        pages = len(pageLinks)
    for i in range(0, pages):
        if i > 0:
            # first page was already fetched above; fetch subsequent ones
            source = self._fetch_url(self.BASE_URL + pageLinks[i]['href'])
            showIndex = BeautifulSoup(source)
        episodesListing = showIndex.find(
            'ul', {'class': re.compile('scet_th_full')}).findAll('li')
        for episode in episodesListing:
            vidInfo = {'tvshowtitle': tvshowtitle, 'studio': 'NBC'}
            title = decode_htmlentities(
                episode.find('p', {
                    'class': re.compile('list_full_det_title')
                }).find('a').string)
            listitem = xbmcgui.ListItem(title)
            listitem.setThumbnailImage(episode.find('img')['src'])
            episodeLink = episode.find('a')
            # normalise the link to a path relative to BASE_URL
            if episodeLink['href'][0] == '/':
                episodeUrl = episodeLink['href'][1:]
            else:
                episodeUrl = episodeLink['href'].replace(self.BASE_URL, '')
            if episode.find('p', {'class': re.compile('list_full_des')}):
                vidInfo['plot'] = decode_htmlentities(
                    episode.find('p', {
                        'class': re.compile('list_full_des')
                    }).find('em').string)
            # titles look like "Ep. 412: Name" -> season 4, episode 12
            epNum = re.search(
                '^Ep(?:\.\s*)?([0-9]{1,2})([0-9][0-9])(?:\s*:\s*)?(.+)$',
                title)
            if epNum != None:
                vidInfo['season'] = int(epNum.group(1))
                vidInfo['episode'] = int(epNum.group(2))
                vidInfo['title'] = epNum.group(3)
            #airedDateAndPlot = re.search('Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$',str(episode.find('div',{'class':'episodeInfo'})))
            #seasonNum = re.search('Season\s+([0-9]+?)[\s:]',str(episode.find('p',{'class':'seasonNum'})))
            #episodeNumAndDuration = re.search('Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)',str(episode.find('p',{'class':'episodeNumLine'})))
            #vidInfo['aired'] = '%s-%s-%s' % (airedDateAndPlot.group(3),airedDateAndPlot.group(1),airedDateAndPlot.group(2))
            #vidInfo['season'] = int(seasonNum.group(1))
            #vidInfo['episode'] = int(episodeNumAndDuration.group(1))
            #vidInfo['duration'] = episodeNumAndDuration.group(2)
            #vidInfo['title'] = episode.find('h3').find('a').string
            #vidInfo['plot'] = decode_htmlentities(airedDateAndPlot.group(4))
            #print vidInfo
            listitem.setInfo("video", vidInfo)
            xbmcplugin.addDirectoryItem(
                handle=int(sys.argv[1]),
                listitem=listitem,
                url="%s?episodeUrl=%s&episode=%s&season=%s" %
                (sys.argv[0], quote_plus(episodeUrl), vidInfo['episode'],
                 vidInfo['season']))
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                             sortMethod=xbmcplugin.SORT_METHOD_EPISODE)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                             sortMethod=xbmcplugin.SORT_METHOD_DATE)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                             sortMethod=xbmcplugin.SORT_METHOD_LABEL)
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                             sortMethod=xbmcplugin.SORT_METHOD_DURATION)
    xbmcplugin.setContent(handle=int(sys.argv[1]), content='episodes')
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def _processNormAccount(self, raw, account_path, balance): soup = BeautifulSoup(raw) logging.debug('Norm ac path - ' + str(account_path) + ' - end' ) try: if account_path != "": # delete existing current xactions logging.debug('Processing :) norm ' ) builder = StatementBuilder(self.facade, account_path, self.token) self.statementlist.append(builder) self.statementbuilder = self.statementlist[self.current_statement] # we know this is not a credit card isCCard = False # get a fixed balance somewhere?? # balance passed in for RBS # set up our statement self.statementbuilder.make_recent_dif_statement('RBS-recent', 'Scraper', None) # now set the final balance logging.debug("Balance - - - - - - - > " + str(balance)) self.statementbuilder.set_current_balance(balance) # now find all the recent transactions x_table = soup.find('table', attrs={'class' : 'ItemTable'}) if x_table == None: # could easily be no transactions logging.debug(" No xtable ======>") if x_table != None: x_body = x_table.find('tbody') inputs = x_body.findAll('tr') # build the post values up for rows in inputs: atts = {} vals = rows.findAll('td') if vals: cash = '' for i, val in enumerate(vals): data = self.tidy_text(val.text) if i == 0: logging.debug("date ======> " + data) dp = DateParser() try: atts['date'] = dp.ymd_from_date(dp.date_from_small(data)) except: atts['date'] == '' if i == 1: if data == 'ATM': cash = 'CASH - ' if i == 2: if data != "": extra = "" datebit = "" parts = data.split(',') if len(parts) > 1: # match RBS dates - a.la. 
8062 14APR11 if re.match('\d{4}\s\d\d[A-Z]{3}\d\d', parts[0]) != None: datebit = parts[0][0:4] + ' ' + parts[0][5:7] + ' ' + parts[0][7:10] # remember pretty_display strips out any words containing a sequence of 3 or more numbers parts = parts[1:] if len(parts) > 1: extra = parts[-1] parts = parts[0:-1] data = ' '.join(parts) disp = (cash + data).strip() atts['display'] = " ".join(disp.split()) atts['extradisplay'] = " ".join( (extra + " " + datebit).split()) if i > 2: # the numbers if data != "" and data != '-': logging.debug('->' + data + '<-') amount = self.normalise_ammount(data) if i == 3: atts['amount'] = amount atts['type'] = 'Credit' if i == 4: atts['amount'] = amount atts['type'] = 'Debit' if i == 5: self.statementbuilder.make_xact(atts) self.statementbuilder.put_statement() self.current_statement = self.current_statement + 1 except Exception, e: logging.exception('RBS parsing error - ' + str(e))
def _processCCAccount(self, raw, account_path, balance): soup = BeautifulSoup(raw) logging.debug('CC ac path - ' + str(account_path) + ' - end' ) try: if account_path != "": # delete existing current xactions logging.debug('Processing :) ' ) builder = StatementBuilder(self.facade, account_path, self.token) self.statementlist.append(builder) self.statementbuilder = self.statementlist[self.current_statement] # we know this is not a credit card isCCard = True # get a fixed balance somewhere?? # passed in for RBS # set up our statement self.statementbuilder.make_recent_dif_statement('RBS-recent', 'Scraper', None) # now set the final balance logging.debug("Balance - - - - - - - > " + str(balance)) self.statementbuilder.set_current_balance(balance) # now find all the recent transactions x_table = soup.find('table', attrs={'class' : 'ItemTable'}) if x_table != None: x_body = x_table.find('tbody') inputs = x_body.findAll('tr') # build the post values up for rows in inputs: atts = {} vals = rows.findAll('td') if vals: datebit = '' for i, val in enumerate(vals): data = self.tidy_text(val.text) if i == 0: logging.debug("date ======> " + data) dp = DateParser() try: atts['date'] = dp.ymd_from_date(dp.date_from_small(data)) except: atts['date'] == '' if i == 1: datebit = data[:-5] if i == 2: if data != 'SALE': # only keep extra xact date for Sales datebit = '' if i == 3: if data != "": atts['display'] = " ".join(data.split()) atts['extradisplay'] = datebit if i > 3: # the numbers if data != "" and data != '-': amount = self.normalise_ammount(data) if i == 4: atts['amount'] = amount atts['type'] = 'Credit' if i == 5: atts['amount'] = amount atts['type'] = 'Debit' if i == 5: self.statementbuilder.make_xact(atts) self.statementbuilder.put_statement() self.current_statement = self.current_statement + 1 except Exception, e: logging.exception('RBS parsing error - ' + str(e))
def processAccount(self, acCount, acName, account_path, allofit):
    """Parse one FD account page (hex-encoded in allofit['body']) and
    store its transactions as a 'Fd-recent' statement.

    Column layout differs between Visa and current accounts - the code
    below shifts indices by one when the Visa sort control is present.
    NOTE(review): `next = False` shadows the builtin and is never read;
    `balance_table <> None` uses the Python-2-only <> operator.
    """
    page = self.HexToByte(allofit['body'])
    # save this page for offline debugging
    self.output_page("account" + str(acCount) + ".html", page)
    soup = BeautifulSoup(page)
    logging.debug('ac path - ' + str(account_path) + ' - end')
    if account_path != "":
        # delete existing current xactions
        logging.debug('Processing :) ')
        self.statementbuilder = StatementBuilder(self.facade, account_path,
                                                 self.token)
        # need to get last statement and make a new one every time
        self.statementbuilder.make_recent_dif_statement(
            'Fd-recent', 'Scraper', None)
        #TODO change this
        # the reference-sort control only appears on Visa statements
        isVisa = False
        loginform = soup.find(
            'input', attrs={'name': 'cmd_sort_referenceAscending'})
        if loginform != None:
            isVisa = True
        bal_tables = soup.findAll(
            'table', attrs={'class': 'fdTableBackgroundOne'})
        balance_table = bal_tables[2]
        if balance_table <> None:
            vals = balance_table.findAll('td')
            if vals:
                bal = vals[1].text
                # normalise the pound sign - presumably byte vs unicode
                # encodings of the same glyph; TODO confirm
                data = bal.replace('£', u'£')
                data = data.strip(u'£')
                # trailing D/DB marks a debit (negative) balance,
                # C/CR a credit balance
                if data[-1] == 'D':
                    data = data.replace('DB', '')
                    data = data.replace('D', '')
                    lastbal = int(float(data) * 100)
                    firstbal = 0 - lastbal
                else:
                    data = data.replace('CR', '')
                    data = data.replace('C', '')
                    firstbal = int(float(data) * 100)
                self.statementbuilder.set_current_balance(firstbal)
        logging.debug(
            "-----------------------------*******---------------------")
        if isVisa:
            logging.debug("found visa --")
        acTable = soup.find('table', attrs={'class': 'fdStatTable'})
        # if no table then no new data afaik
        if acTable != None:
            datarows = acTable.findAll('tr')
            next = False
            # build the post values up
            atts = {}
            isFirst = True
            firstbal = 0
            firstdate = ""
            lastbal = 0
            lastdate = ""
            doBalance = False
            dp = DateParser()
            for rows in datarows:
                vals = rows.findAll('td')
                if vals:
                    for i, val in enumerate(vals):
                        if val.text:
                            data = val.text.strip()
                            data = unescape(data)
                            data = unicode(data)
                        else:
                            data = ""
                        if data != " ":
                            data = data.replace(' ', '')
                        if i == 0:
                            # a row without a date re-uses the last seen date
                            if data != "":
                                try:
                                    lastdate = dp.ymd_from_date(
                                        dp.date_from_dmy(data, '/'))
                                except:
                                    logging.warn(
                                        "Invalid FD date format - probably no transactions"
                                    )
                                    return
                            if firstdate == "":
                                firstdate = lastdate
                            atts['date'] = lastdate
                        # description column (shifted by 1 on Visa)
                        if (i == 1 and not isVisa) or (i == 2 and isVisa):
                            atts['display'] = data[0:19]
                            atts['extradisplay'] = data[19:]
                        # debit amount column
                        if (i == 2 and not isVisa) or (i == 3 and isVisa):
                            if data != "":
                                data = data.strip(u'£')
                                data = data.strip(u'D')
                                data = data.strip(u'B')
                                if data == '':
                                    atts['amount'] = 0
                                else:
                                    atts['amount'] = int(float(data) * 100)
                                atts['type'] = 'Debit'
                        # credit amount column
                        if (i == 3 and not isVisa) or (i == 4 and isVisa):
                            if data != "":
                                data = data.strip(u'£')
                                data = data.strip(u'C')
                                data = data.strip(u'R')
                                if data == '':
                                    atts['amount'] = 0
                                else:
                                    atts['amount'] = int(float(data) * 100)
                                atts['type'] = 'Credit'
                        if not isVisa:
                            # current accounts carry a running balance in
                            # columns 4/5; the first row's value becomes
                            # the statement balance
                            if i == 4:
                                data = data.strip(u'£')
                                if data != "":
                                    lastbal = int(float(data) * 100)
                                    if isFirst:
                                        isFirst = False
                                        firstbal = lastbal
                                        doBalance = True
                            if i == 5:
                                if doBalance:
                                    doBalance = False
                                    if data == "D":
                                        firstbal = 0 - firstbal
                                    self.statementbuilder.set_current_balance(
                                        firstbal)
                    self.statementbuilder.make_xact(atts)
            self.statementbuilder.put_statement()
            self.current_statement = self.current_statement + 1
def _processNormAccount(self, raw, account_path, balance): soup = BeautifulSoup(raw) logging.debug("Norm ac path - " + str(account_path) + " - end") try: if account_path != "": # delete existing current xactions logging.debug("Processing :) norm ") builder = StatementBuilder(self.facade, account_path, self.token) self.statementlist.append(builder) self.statementbuilder = self.statementlist[self.current_statement] # we know this is not a credit card isCCard = False # get a fixed balance somewhere?? # balance passed in for natwest # set up our statement self.statementbuilder.make_recent_dif_statement("NatWest-recent", "Scraper", None) # now set the final balance logging.debug("Balance - - - - - - - > " + str(balance)) self.statementbuilder.set_current_balance(balance) # now find all the recent transactions x_table = soup.find("table", attrs={"class": "ItemTable"}) if x_table == None: # could easily be no transactions logging.debug(" No xtable ======>") if x_table != None: x_body = x_table.find("tbody") inputs = x_body.findAll("tr") # build the post values up for rows in inputs: atts = {} vals = rows.findAll("td") if vals: cash = "" for i, val in enumerate(vals): data = self.tidy_text(val.text) if i == 0: dp = DateParser() try: atts["date"] = dp.ymd_from_date(dp.date_from_small(data)) except: atts["date"] == "" if i == 1: if data == "ATM": cash = "CASH - " if i == 2: if data != "": extra = "" datebit = "" parts = data.split(",") if len(parts) > 1: # match natwest dates - a.la. 
8062 14APR11 if re.match("\d{4}\s\d\d[A-Z]{3}\d\d", parts[0]) != None: datebit = parts[0][0:4] + " " + parts[0][5:7] + " " + parts[0][7:10] # remember pretty_display strips out any words containing a sequence of 3 or more numbers parts = parts[1:] if len(parts) > 1: extra = parts[-1] parts = parts[0:-1] data = " ".join(parts) disp = (cash + data).strip() atts["display"] = " ".join(disp.split()) atts["extradisplay"] = " ".join((extra + " " + datebit).split()) if i > 2: # the numbers if data != "" and data != "-": amount = self.normalise_ammount(data) if i == 3: atts["amount"] = amount atts["type"] = "Credit" if i == 4: atts["amount"] = amount atts["type"] = "Debit" if i == 5: self.statementbuilder.make_xact(atts) self.statementbuilder.put_statement() self.current_statement = self.current_statement + 1 except Exception, e: logging.exception("NatWest parsing error - " + str(e))
class ProntuarioVeiculo(object):
    '''Parses a DetranNet vehicle record ("prontuario") HTML page into a
    dictionary keyed by the page's section labels.

    The page is split into fixed, numbered div sections
    (div_servicos_NN); each _parsear_* method handles one of them.
    '''

    def __init__(self, prontuario_html):
        '''Parse every section of the given HTML document up front.

        prontuario_html -- raw HTML of the record page (iso-8859-1)
        '''
        self.__soup = \
            BeautifulSoup(prontuario_html, fromEncoding='iso-8859-1',
                          convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.__prontuario = {}
        self._parsear_dados_veiculo()
        self._parsear_debitos()
        self._parsear_infracoes_em_autuacao()
        self._parsear_listagem_multas()
        self._parsear_historico_multas()
        self._parsear_ultimo_processo()
        self._parsear_recurso_infracao()

    def obter_prontuario(self):
        # deep copy so callers cannot mutate the parsed state
        return deepcopy(self.__prontuario)

    def _parsear_dados_veiculo(self):
        # vehicle data section: each <td> holds a label/value text pair
        tabela = self.__soup.find('div', id='div_servicos_02' ).table.tbody
        for celula in tabela.findAll('td'):
            dado = celula.findAll(text=True)
            if len(dado) == 2:
                self.__prontuario[dado[0].strip()] = dado[1].strip()

    def _parsear_debitos(self):
        # outstanding debts table; skip the header and footer rows
        tabela = self.__soup.find('div', id='div_servicos_03' ).table.tbody
        debitos = []
        for linha in tabela.findAll('tr')[1:-1]:
            debito = {}
            texto = linha.td.findAll(text=True)
            if texto == None:
                texto = ''
            debito[u'Classe'] = ''.join(texto).strip()
            link = ''
            if linha.td.a != None:
                link = linha.td.a['href'].strip()
            debito[u'Link'] = link
            celulas = [u'Número DetranNet', u'Vencimento',
                       u'Valor Nominal(R$)', u'Multa(R$)', u'Juros(R$)',
                       u'Valor Atual(R$)']
            for celula, valor in zip(celulas, linha.findAll('td')[1:]):
                debito[celula] = valor.string.strip()
            debitos.append(debito)
        self.__prontuario[u'Débitos'] = debitos

    def _parsear_infracoes_em_autuacao(self):
        # infractions under assessment: leaf <td> cells come in groups of
        # seven per infraction, after three header cells
        tabela = self.__soup.find('div', id='div_servicos_10' ).table.tbody
        celula_filha = lambda tag: tag.name == 'td' and tag.table == None
        celulas = tabela.findAll(celula_filha)[3:]
        infracoes = []
        for i in range(len(celulas)/7):
            linha = 7 * i
            infracao = {}
            infracao[u'Número'] = celulas[linha].a.string
            infracao[u'Link'] = celulas[linha].a['href'].strip()
            infracao[u'Valor'] = celulas[linha + 1].string.strip()
            infracao[u'Situação'] = celulas[linha + 2].string.strip()
            infracao[u'Descrição 1'] = celulas[linha + 3].string.strip()
            infracao[u'Descrição 2'] = celulas[linha + 4].string.strip()
            infracao[u'Local/Complemento 1'] = celulas[linha + 5].string.strip()
            # the last cell may be empty (None) - normalise to u''
            infracao[u'Local/Complemento 2'] = celulas[linha + 6].string
            if infracao[u'Local/Complemento 2'] == None:
                infracao[u'Local/Complemento 2'] = u''
            infracao[u'Local/Complemento 2'] = infracao[u'Local/Complemento 2'].strip()
            infracoes.append(infracao)
        self.__prontuario[u'Infrações em Autuação'] = infracoes

    #TODO: implement full parsing of the fines listing
    def _parsear_listagem_multas(self):
        tabela = self.__soup.find('div', id='div_servicos_04' ).table.tbody
        # only the "no fines" case is handled so far
        if tabela.tr.td.find(text=re.compile(u'Nenhuma?')):
            self.__prontuario[u'Listagem de Multas'] = []
            return

    def _parsear_historico_multas(self):
        # fines history: same flattened 7-cells-per-row layout as the
        # infractions section above
        tabela = self.__soup.find('div', id='div_servicos_07' ).table.tbody
        celula_filha = lambda tag: tag.name == 'td' and tag.table == None
        celulas = tabela.findAll(celula_filha)[3:]
        multas = []
        for i in range(len(celulas)/7):
            linha = 7 * i
            multa = {}
            multa[u'Número'] = celulas[linha].a.string
            multa[u'Link'] = celulas[linha].a['href'].strip()
            multa[u'Lançamento'] = celulas[linha + 1].string.strip()
            multa[u'Pagamento'] = celulas[linha + 2].string.strip()
            multa[u'Descrição 1'] = celulas[linha + 3].string.strip()
            multa[u'Descrição 2'] = celulas[linha + 4].string.strip()
            multa[u'Local/Complemento 1'] = celulas[linha + 5].string.strip()
            # the last cell may be empty (None) - normalise to u''
            multa[u'Local/Complemento 2'] = celulas[linha + 6].string
            if multa[u'Local/Complemento 2'] == None:
                multa[u'Local/Complemento 2'] = u''
            multa[u'Local/Complemento 2'] = multa[u'Local/Complemento 2'].strip()
            multas.append(multa)
        self.__prontuario[u'Histórico de Multas'] = multas

    def _parsear_ultimo_processo(self):
        tabela = self.__soup.find('div', id='div_servicos_11' ).table.tbody
        ultimo_processo = {}
        celulas = tabela.findAll('td')
        # the first five cells each hold a label/value text pair
        for celula in celulas[:5]:
            dado = celula.findAll(text=True)
            ultimo_processo[dado[0]] = dado[1]
        # the remaining cells alternate key-cell, value-cell
        for i in range(7, len(celulas), 2):
            chave = celulas[i].findAll(text=True)[0]
            valor = celulas[i + 1].findAll(text=True)[0]
            ultimo_processo[chave] = valor
        self.__prontuario[u'Último Processo'] = ultimo_processo

    #TODO: implement full parsing of the infraction appeals section
    def _parsear_recurso_infracao(self):
        tabela = self.__soup.find('div', id='div_servicos_09' ).table.tbody
        # only the "no appeals" case is handled so far
        if tabela.tr.td.find(text=re.compile(u'Nenhuma?')):
            self.__prontuario[u'Recurso de Infração'] = []
            return
def _processNormAccount(self, raw, account_path, balance): soup = BeautifulSoup(raw) logging.debug('Norm ac path - ' + str(account_path) + ' - end') try: if account_path != "": # delete existing current xactions logging.debug('Processing :) norm ') builder = StatementBuilder(self.facade, account_path, self.token) self.statementlist.append(builder) self.statementbuilder = self.statementlist[ self.current_statement] # we know this is not a credit card isCCard = False # get a fixed balance somewhere?? # balance passed in for natwest # set up our statement self.statementbuilder.make_recent_dif_statement( 'NatWest-recent', 'Scraper', None) # now set the final balance logging.debug("Balance - - - - - - - > " + str(balance)) self.statementbuilder.set_current_balance(balance) # now find all the recent transactions x_table = soup.find('table', attrs={'class': 'ItemTable'}) if x_table == None: # could easily be no transactions logging.debug(" No xtable ======>") if x_table != None: x_body = x_table.find('tbody') inputs = x_body.findAll('tr') # build the post values up for rows in inputs: atts = {} vals = rows.findAll('td') if vals: cash = '' for i, val in enumerate(vals): data = self.tidy_text(val.text) if i == 0: dp = DateParser() try: atts['date'] = dp.ymd_from_date( dp.date_from_small(data)) except: atts['date'] == '' if i == 1: if data == 'ATM': cash = 'CASH - ' if i == 2: if data != "": extra = "" datebit = "" parts = data.split(',') if len(parts) > 1: # match natwest dates - a.la. 
8062 14APR11 if re.match( '\d{4}\s\d\d[A-Z]{3}\d\d', parts[0]) != None: datebit = parts[0][ 0:4] + ' ' + parts[0][ 5:7] + ' ' + parts[0][ 7:10] # remember pretty_display strips out any words containing a sequence of 3 or more numbers parts = parts[1:] if len(parts) > 1: extra = parts[-1] parts = parts[0:-1] data = ' '.join(parts) disp = (cash + data).strip() atts['display'] = " ".join( disp.split()) atts['extradisplay'] = " ".join( (extra + " " + datebit).split()) if i > 2: # the numbers if data != "" and data != '-': amount = self.normalise_ammount(data) if i == 3: atts['amount'] = amount atts['type'] = 'Credit' if i == 4: atts['amount'] = amount atts['type'] = 'Debit' if i == 5: self.statementbuilder.make_xact(atts) self.statementbuilder.put_statement() self.current_statement = self.current_statement + 1 except Exception, e: logging.exception('NatWest parsing error - ' + str(e))
def _processCCAccount(self, raw, account_path, balance): soup = BeautifulSoup(raw) logging.debug('CC ac path - ' + str(account_path) + ' - end') try: if account_path != "": # delete existing current xactions logging.debug('Processing :) ') builder = StatementBuilder(self.facade, account_path, self.token) self.statementlist.append(builder) self.statementbuilder = self.statementlist[ self.current_statement] # we know this is not a credit card isCCard = True # get a fixed balance somewhere?? # passed in for natwest # set up our statement self.statementbuilder.make_recent_dif_statement( 'NatWest-recent', 'Scraper', None) # now set the final balance logging.debug("Balance - - - - - - - > " + str(balance)) self.statementbuilder.set_current_balance(balance) # now find all the recent transactions x_table = soup.find('table', attrs={'class': 'ItemTable'}) if x_table != None: x_body = x_table.find('tbody') inputs = x_body.findAll('tr') # build the post values up for rows in inputs: atts = {} vals = rows.findAll('td') if vals: datebit = '' for i, val in enumerate(vals): data = self.tidy_text(val.text) if i == 0: dp = DateParser() try: atts['date'] = dp.ymd_from_date( dp.date_from_small(data)) except: atts['date'] == '' if i == 1: datebit = data[:-5] if i == 2: if data != 'SALE': # only keep extra xact date for Sales datebit = '' if i == 3: if data != "": atts['display'] = " ".join( data.split()).encode('utf8') atts['extradisplay'] = datebit.encode( 'utf8') if i > 3: # the numbers if data != "" and data != '-': amount = self.normalise_ammount(data) if i == 4: atts['amount'] = amount atts['type'] = 'Credit' if i == 5: atts['amount'] = amount atts['type'] = 'Debit' if i == 5: self.statementbuilder.make_xact(atts) self.statementbuilder.put_statement() self.current_statement = self.current_statement + 1 except Exception, e: logging.exception('NatWest parsing error - ' + str(e))
def doStep4(self, allofit, page): scrape_result = "good" # -------------------------------- Grab the form values ----------------------------------------------- soup = BeautifulSoup(page) # write out the start page self.output_page("natwest-pos-accounts.html", page) scrape_result = "good" logging.info("NatWest message or bad cred check ") # if we still have the input then def bad credentials errorDiv = soup.findAll("input", attrs={"name": "ctl00$mainContent$LI6PPEA_edit"}) if len(errorDiv) != 0: logging.info("NatWest defiantely bad credentials") return "credentials incorrect" accountBLock = soup.findAll("table", attrs={"class": "AccountTable"}) # got some acount details so all good if len(accountBLock) > 0: logging.debug("NatWest defiantely got some good accounts") return "good" # find any link # if we find a link return it # check for the normal continue button and fail all else - with credentials failure continueButton = soup.find("input", attrs={"id": "ctl00_mainContent_FinishButton_button"}) if continueButton == None: logging.warning("NatWest cant find finish button credentials incorrect") nextButton = soup.find("input", attrs={"id": "ctl00_mainContent_NextButton_button"}) if nextButton == None: logging.warning("NatWest cant find next button either") return "credentials incorrect" # now find the form that these buttons belong to loginform = soup.find("form", attrs={"name": "aspnetForm"}) if loginform == None: logging.debug("NatWest no continue form") return "bank error" else: logging.debug("found a continue form - so clicking it") action = self.urlBase + "/" + loginform["action"] # any hidden values etc values = self.parseForm(loginform) # build the body content data = urllib.urlencode(values) self.response = {} self.response["url"] = self.ByteToHex(action) self.response["data"] = self.ByteToHex(data) self.response["method"] = "POST" self.response["step"] = 4 return "messages"
def _processCCAccount(self, raw, account_path, balance): soup = BeautifulSoup(raw) logging.debug("CC ac path - " + str(account_path) + " - end") try: if account_path != "": # delete existing current xactions logging.debug("Processing :) ") builder = StatementBuilder(self.facade, account_path, self.token) self.statementlist.append(builder) self.statementbuilder = self.statementlist[self.current_statement] # we know this is not a credit card isCCard = True # get a fixed balance somewhere?? # passed in for natwest # set up our statement self.statementbuilder.make_recent_dif_statement("NatWest-recent", "Scraper", None) # now set the final balance logging.debug("Balance - - - - - - - > " + str(balance)) self.statementbuilder.set_current_balance(balance) # now find all the recent transactions x_table = soup.find("table", attrs={"class": "ItemTable"}) if x_table != None: x_body = x_table.find("tbody") inputs = x_body.findAll("tr") # build the post values up for rows in inputs: atts = {} vals = rows.findAll("td") if vals: datebit = "" for i, val in enumerate(vals): data = self.tidy_text(val.text) if i == 0: dp = DateParser() try: atts["date"] = dp.ymd_from_date(dp.date_from_small(data)) except: atts["date"] == "" if i == 1: datebit = data[:-5] if i == 2: if data != "SALE": # only keep extra xact date for Sales datebit = "" if i == 3: if data != "": atts["display"] = " ".join(data.split()).encode("utf8") atts["extradisplay"] = datebit.encode("utf8") if i > 3: # the numbers if data != "" and data != "-": amount = self.normalise_ammount(data) if i == 4: atts["amount"] = amount atts["type"] = "Credit" if i == 5: atts["amount"] = amount atts["type"] = "Debit" if i == 5: self.statementbuilder.make_xact(atts) self.statementbuilder.put_statement() self.current_statement = self.current_statement + 1 except Exception, e: logging.exception("NatWest parsing error - " + str(e))
def processAccount(self, acCount, acName, account_path, allofit):
    """Parse one First Direct account page and store its recent statement.

    acCount      -- index of the account, used only in the debug page name
    acName       -- display name of the account (unused in this method)
    account_path -- datastore path of the account ("" means skip entirely)
    allofit      -- dict whose hex-encoded 'body' entry holds the page HTML

    Fixes vs. original: the hard-coded ``bal_tables[2]`` was indexed before
    the None check and raised IndexError when fewer than three tables were
    present; the deprecated ``<>`` operator and the unused builtin-shadowing
    local ``next`` were removed; bare excepts narrowed to Exception.
    """
    page = self.HexToByte(allofit['body'])
    # save this page for offline inspection
    self.output_page("account" + str(acCount) + ".html", page)
    soup = BeautifulSoup(page)
    logging.debug('ac path - ' + str(account_path) + ' - end')
    if account_path == "":
        return
    logging.debug('Processing :) ')
    self.statementbuilder = StatementBuilder(self.facade, account_path, self.token)
    # need to get last statement and make a new one every time
    self.statementbuilder.make_recent_dif_statement('Fd-recent', 'Scraper', None)
    # TODO change this
    # Visa pages carry a sort control that ordinary accounts do not.
    isVisa = soup.find('input', attrs={'name': 'cmd_sort_referenceAscending'}) is not None
    # ----- headline balance ------------------------------------------------
    bal_tables = soup.findAll('table', attrs={'class': 'fdTableBackgroundOne'})
    # BUG FIX: guard the hard-coded index before using it.
    balance_table = bal_tables[2] if len(bal_tables) > 2 else None
    if balance_table is not None:
        vals = balance_table.findAll('td')
        if vals:
            bal = vals[1].text
            data = bal.replace('£', u'£')
            data = data.strip(u'£')
            if data[-1] == 'D':
                # Debit balance, e.g. "12.34D"/"12.34DB" -> negative pence.
                data = data.replace('DB', '')
                data = data.replace('D', '')
                lastbal = int(float(data) * 100)
                firstbal = 0 - lastbal
            else:
                # Credit balance, e.g. "12.34CR" -> positive pence.
                data = data.replace('CR', '')
                data = data.replace('C', '')
                firstbal = int(float(data) * 100)
            self.statementbuilder.set_current_balance(firstbal)
    logging.debug("-----------------------------*******---------------------")
    if isVisa:
        logging.debug("found visa --")
    # ----- transactions table ----------------------------------------------
    acTable = soup.find('table', attrs={'class': 'fdStatTable'})
    # if no table then no new data afaik
    if acTable is not None:
        datarows = acTable.findAll('tr')
        atts = {}
        isFirst = True
        firstbal = 0
        firstdate = ""
        lastbal = 0
        lastdate = ""
        doBalance = False
        dp = DateParser()
        for rows in datarows:
            vals = rows.findAll('td')
            if not vals:
                continue
            for i, val in enumerate(vals):
                if val.text:
                    data = unicode(unescape(val.text.strip()))
                else:
                    data = ""
                if data != " ":
                    data = data.replace(' ', '')
                if i == 0:
                    # Transaction date column; gives up on the whole page if
                    # the format is wrong (usually means no transactions).
                    if data != "":
                        try:
                            lastdate = dp.ymd_from_date(dp.date_from_dmy(data, '/'))
                        except Exception:
                            logging.warn("Invalid FD date format - probably no transactions")
                            return
                    if firstdate == "":
                        firstdate = lastdate
                    atts['date'] = lastdate
                if (i == 1 and not isVisa) or (i == 2 and isVisa):
                    # Description column: first 19 chars are the display name.
                    atts['display'] = data[0:19]
                    atts['extradisplay'] = data[19:]
                if (i == 2 and not isVisa) or (i == 3 and isVisa):
                    # Debit amount column.
                    if data != "":
                        data = data.strip(u'£')
                        data = data.strip(u'D')
                        data = data.strip(u'B')
                        if data == '':
                            atts['amount'] = 0
                        else:
                            atts['amount'] = int(float(data) * 100)
                        atts['type'] = 'Debit'
                if (i == 3 and not isVisa) or (i == 4 and isVisa):
                    # Credit amount column.
                    if data != "":
                        data = data.strip(u'£')
                        data = data.strip(u'C')
                        data = data.strip(u'R')
                        if data == '':
                            atts['amount'] = 0
                        else:
                            atts['amount'] = int(float(data) * 100)
                        atts['type'] = 'Credit'
                if not isVisa:
                    if i == 4:
                        # Running balance column (non-Visa only).
                        data = data.strip(u'£')
                        if data != "":
                            lastbal = int(float(data) * 100)
                            if isFirst:
                                isFirst = False
                                firstbal = lastbal
                                doBalance = True
                    if i == 5:
                        # Balance sign column; only acted on for the first row.
                        if doBalance:
                            doBalance = False
                            if data == "D":
                                firstbal = 0 - firstbal
                            self.statementbuilder.set_current_balance(firstbal)
            self.statementbuilder.make_xact(atts)
        self.statementbuilder.put_statement()
        self.current_statement = self.current_statement + 1
class ProntuarioVeiculo(object):
    """Scraper for a Detran vehicle record ("prontuário") HTML page.

    The constructor eagerly parses every section of the page; the result is
    exposed as a plain dict via obter_prontuario().

    Fixes vs. original: the two 7-column infraction tables (infractions in
    notice stage, fines history) were parsed by near-identical duplicated
    code; both now share _parsear_tabela_sete_colunas.  ``== None`` checks
    were replaced with ``is None``.
    """

    def __init__(self, prontuario_html):
        """Parse prontuario_html (ISO-8859-1 encoded) into the record dict."""
        self.__soup = \
            BeautifulSoup(prontuario_html, fromEncoding='iso-8859-1',
                          convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.__prontuario = {}
        self._parsear_dados_veiculo()
        self._parsear_debitos()
        self._parsear_infracoes_em_autuacao()
        self._parsear_listagem_multas()
        self._parsear_historico_multas()
        self._parsear_ultimo_processo()
        self._parsear_recurso_infracao()

    def obter_prontuario(self):
        """Return a deep copy of the parsed record so callers cannot mutate it."""
        return deepcopy(self.__prontuario)

    def _parsear_dados_veiculo(self):
        # Vehicle data: each cell holds a label/value pair of text nodes.
        tabela = self.__soup.find('div', id='div_servicos_02').table.tbody
        for celula in tabela.findAll('td'):
            dado = celula.findAll(text=True)
            if len(dado) == 2:
                self.__prontuario[dado[0].strip()] = dado[1].strip()

    def _parsear_debitos(self):
        # Outstanding debts; first row is the header and last is a footer.
        tabela = self.__soup.find('div', id='div_servicos_03').table.tbody
        debitos = []
        for linha in tabela.findAll('tr')[1:-1]:
            debito = {}
            texto = linha.td.findAll(text=True)
            if texto is None:
                # NOTE(review): findAll returns a list, so this branch looks
                # unreachable -- kept for safety.
                texto = ''
            debito[u'Classe'] = ''.join(texto).strip()
            link = ''
            if linha.td.a is not None:
                link = linha.td.a['href'].strip()
            debito[u'Link'] = link
            celulas = [
                u'Número DetranNet', u'Vencimento', u'Valor Nominal(R$)',
                u'Multa(R$)', u'Juros(R$)', u'Valor Atual(R$)'
            ]
            for celula, valor in zip(celulas, linha.findAll('td')[1:]):
                debito[celula] = valor.string.strip()
            debitos.append(debito)
        self.__prontuario[u'Débitos'] = debitos

    def _parsear_tabela_sete_colunas(self, id_div, rotulos):
        """Shared parser for the two 7-column infraction tables.

        id_div  -- id of the <div> whose table holds the rows
        rotulos -- pair of dict keys for columns 1 and 2 (the only labels
                   that differ between the two tables)
        Returns a list of dicts, one per 7-cell logical row.
        """
        tabela = self.__soup.find('div', id=id_div).table.tbody
        # Leaf cells only (skip cells that contain nested tables); the
        # first three are headers.
        celula_filha = lambda tag: tag.name == 'td' and tag.table is None
        celulas = tabela.findAll(celula_filha)[3:]
        registros = []
        for i in range(len(celulas) // 7):
            linha = 7 * i
            registro = {}
            registro[u'Número'] = celulas[linha].a.string
            registro[u'Link'] = celulas[linha].a['href'].strip()
            registro[rotulos[0]] = celulas[linha + 1].string.strip()
            registro[rotulos[1]] = celulas[linha + 2].string.strip()
            registro[u'Descrição 1'] = celulas[linha + 3].string.strip()
            registro[u'Descrição 2'] = celulas[linha + 4].string.strip()
            registro[u'Local/Complemento 1'] = celulas[linha + 5].string.strip()
            # The last column may be an empty cell (string is None).
            complemento = celulas[linha + 6].string
            if complemento is None:
                complemento = u''
            registro[u'Local/Complemento 2'] = complemento.strip()
            registros.append(registro)
        return registros

    def _parsear_infracoes_em_autuacao(self):
        # Infractions still at the notice ("autuação") stage.
        self.__prontuario[u'Infrações em Autuação'] = \
            self._parsear_tabela_sete_colunas('div_servicos_10',
                                              (u'Valor', u'Situação'))

    # TODO: Implementar
    def _parsear_listagem_multas(self):
        # Only the "no fines" case is handled; otherwise the key is never
        # set (pre-existing behaviour, see TODO above).
        tabela = self.__soup.find('div', id='div_servicos_04').table.tbody
        if tabela.tr.td.find(text=re.compile(u'Nenhuma?')):
            self.__prontuario[u'Listagem de Multas'] = []
            return

    def _parsear_historico_multas(self):
        # History of fines with their posting and payment dates.
        self.__prontuario[u'Histórico de Multas'] = \
            self._parsear_tabela_sete_colunas('div_servicos_07',
                                              (u'Lançamento', u'Pagamento'))

    def _parsear_ultimo_processo(self):
        tabela = self.__soup.find('div', id='div_servicos_11').table.tbody
        ultimo_processo = {}
        celulas = tabela.findAll('td')
        # First five cells hold label/value text-node pairs in one cell.
        for celula in celulas[:5]:
            dado = celula.findAll(text=True)
            ultimo_processo[dado[0]] = dado[1]
        # From cell 7 on, labels and values alternate in separate cells.
        for i in range(7, len(celulas), 2):
            chave = celulas[i].findAll(text=True)[0]
            valor = celulas[i + 1].findAll(text=True)[0]
            ultimo_processo[chave] = valor
        self.__prontuario[u'Último Processo'] = ultimo_processo

    # TODO: Implementar
    def _parsear_recurso_infracao(self):
        # Only the "no appeals" case is handled; otherwise the key is never
        # set (pre-existing behaviour, see TODO above).
        tabela = self.__soup.find('div', id='div_servicos_09').table.tbody
        if tabela.tr.td.find(text=re.compile(u'Nenhuma?')):
            self.__prontuario[u'Recurso de Infração'] = []
            return
def doStep4(self, allofit, page):
    """Examine the NatWest post-login page and work out what comes next.

    Returns 'good' when account tables are present, 'credentials incorrect'
    when the login input persists or no continue button can be found,
    'bank error' when the continue form is missing, and 'messages' after
    preparing self.response with the POST that submits the interstitial
    continue form.
    """
    scrape_result = "good"
    parsed = BeautifulSoup(page)
    # keep a copy of the page for offline inspection
    self.output_page("natwest-pos-accounts.html", page)
    scrape_result = 'good'
    logging.info("NatWest message or bad cred check ")
    # The login input is still on the page -> the credentials were rejected.
    bad_cred_inputs = parsed.findAll(
        'input', attrs={'name': 'ctl00$mainContent$LI6PPEA_edit'})
    if len(bad_cred_inputs) != 0:
        logging.info("NatWest defiantely bad credentials")
        return 'credentials incorrect'
    # Account tables on the page -> we are fully logged in.
    account_tables = parsed.findAll('table', attrs={'class': 'AccountTable'})
    if len(account_tables) > 0:
        logging.debug("NatWest defiantely got some good accounts")
        return 'good'
    # Otherwise hunt for the interstitial finish/next buttons; if neither
    # exists, treat it as a credentials failure.
    finish_btn = parsed.find(
        'input', attrs={'id': 'ctl00_mainContent_FinishButton_button'})
    if finish_btn is None:
        logging.warning(
            "NatWest cant find finish button credentials incorrect")
        next_btn = parsed.find(
            'input', attrs={'id': 'ctl00_mainContent_NextButton_button'})
        if next_btn is None:
            logging.warning("NatWest cant find next button either")
            return 'credentials incorrect'
    # Find the ASP.NET form that owns the buttons and prepare to submit it.
    continue_form = parsed.find('form', attrs={'name': 'aspnetForm'})
    if continue_form is None:
        logging.debug('NatWest no continue form')
        return 'bank error'
    logging.debug('found a continue form - so clicking it')
    target_url = self.urlBase + '/' + continue_form['action']
    # gather hidden fields and encode the POST body
    form_fields = self.parseForm(continue_form)
    post_body = urllib.urlencode(form_fields)
    self.response = {
        'url': self.ByteToHex(target_url),
        'data': self.ByteToHex(post_body),
        'method': 'POST',
        'step': 4,
    }
    return 'messages'