def get_product_pages(static,url,logger):
    """Read the Tesco product XML and submit every mortgage found in it.

    static -- if true, parse the saved copy at static_html/tesco/Products.xml;
              otherwise fetch the XML from url
    url    -- address of the live product XML (also written to the debug log)
    logger -- logger handed to the fetch, parse and insert helpers

    The document is walked as
    <HousePurchase|Remortgage>/<FixedRate|TrackerRate>/<LTV max=..>/<Mortgage>.
    Each mortgage is passed to mc_util.handle_mortgage_insert once per
    eligibility code of its purchase section. A purchase or rate section that
    is absent from the XML raises AttributeError.
    """
    logger.debug("In get_product_pages: " + url)
    # Only touch the network when not running from the saved copy; the XML is
    # fetched exactly once.
    if static:
        root = ET.parse('static_html/tesco/Products.xml').getroot()
    else:
        root = ET.fromstring(themortgagemeter_utils.get_page(False,'',url,logger,True))
    term = str(25 * 12)
    # XML section name -> eligibility codes recorded for its products.
    purchase_sections = (
        ('HousePurchase', ['NFTB','NMH']),
        ('Remortgage', ['NRM']),
    )
    # XML section name -> mortgage type code.
    rate_sections = (
        ('FixedRate', 'F'),
        ('TrackerRate', 'T'),
    )
    for purchase, eligibilities in purchase_sections:
        for rate_type, mortgage_type in rate_sections:
            rate_set = root.find(purchase).find(rate_type)
            for rate in rate_set.findall('LTV'):
                ltv_percent = rate.get('max')
                for mortgage in rate.findall('Mortgage'):
                    rate_percent = mortgage.find('initialRate').text
                    apr_percent = mortgage.find('APR').text
                    svr_percent = mortgage.find('variableRate').text
                    # Only the first line of the product name is used to
                    # work out the initial period.
                    name = mortgage.find('name').text.split('\n')[0]
                    initial_period = themortgagemeter_utils.get_months(name,logger)
                    # Booking fee and product fee are reported as one figure.
                    booking_fee = str(int(mortgage.find('bookingFee').text) + int(mortgage.find('productFee').text))
                    for eligibility in eligibilities:
                        mc_util.handle_mortgage_insert(institution_code,mortgage_type,rate_percent,svr_percent,apr_percent,ltv_percent,initial_period,booking_fee,term,'http://www.tescobank.com/personal/finance/mortgages',eligibility,logger)
def process_page(static, base_url, url_suffix, eligibility):
    """Follow every product link on a Northern Rock listing page and record its rates.

    static      -- passed to themortgagemeter_utils.get_page together with a
                   saved-page path
    base_url    -- site root; prefixed to url_suffix and to each product href
    url_suffix  -- path of the listing page
    eligibility -- eligibility code stored with every mortgage found

    For each anchor with class "continue moreinfo" the linked page is loaded
    and its 'fixedpanel' and 'trackerpanel' table rows are read. The first
    five <span> elements of a row hold, in order: rate, SVR, APR, booking fee
    and LTV. Rows are submitted through mc_util.handle_mortgage_insert.
    """
    logger = logging.getLogger('retrieve')
    bsobj = themortgagemeter_utils.get_page(
        static, 'static_html/northernrock/First-Time-Buyer',
        base_url + url_suffix, logger)
    for anchor in bsobj.find_all(attrs={'class': 'continue moreinfo'}):
        url = base_url + anchor['href']
        logger.info(url)
        anchor_bsobj = themortgagemeter_utils.get_page(
            static, 'static_html/northernrock/5yr_everyday_fixed_5ct5', url,
            logger)
        title = anchor_bsobj.find_all('h1')[0].string
        for class_str in ('fixedpanel', 'trackerpanel'):
            trs = anchor_bsobj.find_all('tr', 'list ' + class_str)
            if not trs:
                logger.critical('No data from url: ' + url)
                continue
            # TODO get time period and type from title
            # TODO: I think this is wrong! fixedpanel is different from trackerpanel!
            (initial_period, mortgage_type) = process_title(title, logger)
            for tr in trs:
                spans = tr.find_all('span')
                if not spans:
                    continue
                if len(spans) < 5:
                    # A short row cannot supply all five values; inserting it
                    # would reuse figures left over from the previous row.
                    logger.critical('Incomplete rate row at url: ' + url)
                    continue
                rate_text = spans[0].string
                if rate_text is None:
                    # The rate is sometimes wrapped in an <em> inside the span.
                    rate_text = spans[0].em.string
                rate_percent = rate_text.split('%')[0]
                svr_percent = spans[1].string.split('%')[0]
                apr_percent = spans[2].string.split('%')[0]
                # Drop the leading currency symbol and thousands separators.
                booking_fee = spans[3].string[1:].replace(',', '')
                ltv_percent = spans[4].string.split('%')[0].replace(',', '')
                mc_util.handle_mortgage_insert(
                    institution_code, mortgage_type, rate_percent,
                    svr_percent, apr_percent, ltv_percent, initial_period,
                    booking_fee, term, url, eligibility, logger)
def process_page(static,base_url,url_suffix,eligibility):
    """Follow every product link on a Northern Rock listing page and record its rates.

    static      -- passed to themortgagemeter_utils.get_page together with a
                   saved-page path
    base_url    -- site root; prefixed to url_suffix and to each product href
    url_suffix  -- path of the listing page
    eligibility -- eligibility code stored with every mortgage found

    For each anchor with class "continue moreinfo" the linked page is loaded
    and its 'fixedpanel' and 'trackerpanel' table rows are read. The first
    five <span> elements of a row hold, in order: rate, SVR, APR, booking fee
    and LTV. Rows are submitted through mc_util.handle_mortgage_insert.
    """
    logger = logging.getLogger('retrieve')
    bsobj = themortgagemeter_utils.get_page(static,'static_html/northernrock/First-Time-Buyer',base_url + url_suffix,logger)
    for anchor in bsobj.find_all(attrs={'class' : 'continue moreinfo'}):
        url = base_url + anchor['href']
        logger.info(url)
        anchor_bsobj = themortgagemeter_utils.get_page(static,'static_html/northernrock/5yr_everyday_fixed_5ct5',url,logger)
        title = anchor_bsobj.find_all('h1')[0].string
        for class_str in ('fixedpanel','trackerpanel'):
            trs = anchor_bsobj.find_all('tr','list ' + class_str)
            if not trs:
                logger.critical('No data from url: ' + url)
                continue
            # TODO get time period and type from title
            # TODO: I think this is wrong! fixedpanel is different from trackerpanel!
            (initial_period,mortgage_type) = process_title(title,logger)
            for tr in trs:
                spans = tr.find_all('span')
                if not spans:
                    continue
                if len(spans) < 5:
                    # A short row cannot supply all five values; inserting it
                    # would reuse figures left over from the previous row.
                    logger.critical('Incomplete rate row at url: ' + url)
                    continue
                rate_text = spans[0].string
                if rate_text is None:
                    # The rate is sometimes wrapped in an <em> inside the span.
                    rate_text = spans[0].em.string
                rate_percent = rate_text.split('%')[0]
                svr_percent = spans[1].string.split('%')[0]
                apr_percent = spans[2].string.split('%')[0]
                # Drop the leading currency symbol and thousands separators.
                booking_fee = spans[3].string[1:].replace(',','')
                ltv_percent = spans[4].string.split('%')[0].replace(',','')
                mc_util.handle_mortgage_insert(institution_code,mortgage_type,rate_percent,svr_percent,apr_percent,ltv_percent,initial_period,booking_fee,term,url,eligibility,logger)
def process_page(static, url, mortgage_type, eligibility):
    """Load one NatWest JSON rate feed and record every mortgage in it.

    static        -- passed to themortgagemeter_utils.get_page together with
                     the saved-page path static_html/natwest/fix_ftb.html
    url           -- address of the JSON feed; also stored with each mortgage
    mortgage_type -- mortgage type code stored with each mortgage
    eligibility   -- eligibility code stored with each mortgage

    Returns None. When the feed's "Mortgages" entry is null the URL is logged
    and nothing is inserted.
    """
    logger = logging.getLogger("retrieve")
    resp = themortgagemeter_utils.get_page(
        static, "static_html/natwest/fix_ftb.html", url, logger, tostring=True)
    json_obj = json.loads(resp)
    if json_obj["Mortgages"] is None:
        logger.info("URL returned nothing: " + url)
        return
    mortgages = json_obj["Mortgages"]["mortgage"]
    if isinstance(mortgages, dict):
        # Guard against a lone product being delivered as a bare object
        # instead of a list; iterating the dict would yield its keys.
        mortgages = [mortgages]
    for mortgage in mortgages:
        initial_rate = mortgage["initialRate"]
        rate_percent = initial_rate["value"]
        svr_percent = mortgage["followOnRate"]
        apr_percent = mortgage["overallCostAPR"]
        ltv_percent = mortgage["LTV"]
        # The duration is multiplied by 12 (presumably years -> months).
        initial_period = str(int(initial_rate["duration"]) * 12)
        booking_fee = mortgage["arrangementFee"]
        mc_util.handle_mortgage_insert(
            institution_code,
            mortgage_type,
            rate_percent,
            svr_percent,
            apr_percent,
            ltv_percent,
            initial_period,
            booking_fee,
            term,
            url,
            eligibility,
            logger,
        )
def get_mortgage_page_details(static,base_url,suffix_url,logger):
    """Inspect a single product page; pages without a rate table are crawled further.

    static     -- passed through to themortgagemeter_utils.get_page
    base_url   -- site root
    suffix_url -- path of the page, appended to base_url
    logger     -- logger used for progress and problem reports

    If the page has no element with id 'mtgTableData' it is handed to
    get_product_pages and this function returns. Otherwise the mortgage type
    is guessed from the URL and the table rows are logged. Parsing of the
    individual cells is not implemented yet, so nothing is stored.
    """
    url = base_url + suffix_url
    bsobj = themortgagemeter_utils.get_page(static,'NA',url,logger)
    mtgtables = bsobj.find_all(id='mtgTableData')
    if not mtgtables:
        # We're in a product page with potentially further product pages and
        # no mortgage info, so pass back through get_product_pages.
        get_product_pages(static,base_url,suffix_url,logger)
        return
    logger.info("URL:" + url)
    # The first pattern that matches the URL decides the type code.
    mortgage_type = None
    for pattern, code in (('.*fixed.*','F'),('.*offset.*','O'),('.*tracker.*','T')):
        if re.match(pattern,url):
            mortgage_type = code
            break
    if mortgage_type is None:
        # default to variable
        logger.critical("Couldn't identify url: " + url)
        mortgage_type = 'V'
    # assume default of 25 years
    term = str(25 * 12)
    rows = mtgtables[0].find_all('tbody')[0].find_all('tr')
    for tr in rows:
        tds = tr.find_all('td')
        logger.info(tds)
        for td in tds:
            # Row 1: tells you type of mortgage and fix period "until dd/mm/year"
            pass
def get_mortgage_page_details(static, base_url, suffix_url, logger):
    """Inspect a single product page; pages without a rate table are crawled further.

    static     -- passed through to themortgagemeter_utils.get_page
    base_url   -- site root
    suffix_url -- path of the page, appended to base_url
    logger     -- logger used for progress and problem reports

    If the page has no element with id 'mtgTableData' it is handed to
    get_product_pages and this function returns. Otherwise the mortgage type
    is guessed from the URL and the table rows are logged. Parsing of the
    individual cells is not implemented yet, so nothing is stored.
    """
    url = base_url + suffix_url
    bsobj = themortgagemeter_utils.get_page(static, 'NA', url, logger)
    mtgtables = bsobj.find_all(id='mtgTableData')
    if not mtgtables:
        # We're in a product page with potentially further product pages and
        # no mortgage info, so pass back through get_product_pages.
        get_product_pages(static, base_url, suffix_url, logger)
        return
    logger.info("URL:" + url)
    # The first pattern that matches the URL decides the type code.
    url_patterns = (
        ('.*fixed.*', 'F'),
        ('.*offset.*', 'O'),
        ('.*tracker.*', 'T'),
    )
    mortgage_type = None
    for pattern, code in url_patterns:
        if re.match(pattern, url):
            mortgage_type = code
            break
    if mortgage_type is None:
        # default to variable
        logger.critical("Couldn't identify url: " + url)
        mortgage_type = 'V'
    # assume default of 25 years
    term = str(25 * 12)
    rows = mtgtables[0].find_all('tbody')[0].find_all('tr')
    for tr in rows:
        tds = tr.find_all('td')
        logger.info(tds)
        for td in tds:
            # Row 1: tells you type of mortgage and fix period "until dd/mm/year"
            pass
def process_page(static, url, mortgage_type, eligibility):
    """Load one NatWest JSON rate feed and record every mortgage in it.

    static        -- passed to themortgagemeter_utils.get_page together with
                     the saved-page path static_html/natwest/fix_ftb.html
    url           -- address of the JSON feed; also stored with each mortgage
    mortgage_type -- mortgage type code stored with each mortgage
    eligibility   -- eligibility code stored with each mortgage

    Returns None. When the feed's 'Mortgages' entry is null the URL is logged
    and nothing is inserted.
    """
    logger = logging.getLogger('retrieve')
    resp = themortgagemeter_utils.get_page(static, 'static_html/natwest/fix_ftb.html', url, logger, tostring=True)
    json_obj = json.loads(resp)
    if json_obj['Mortgages'] is None:
        logger.info('URL returned nothing: ' + url)
        return
    mortgages = json_obj['Mortgages']['mortgage']
    if isinstance(mortgages, dict):
        # Guard against a lone product being delivered as a bare object
        # instead of a list; iterating the dict would yield its keys.
        mortgages = [mortgages]
    for mortgage in mortgages:
        initial_rate = mortgage['initialRate']
        rate_percent = initial_rate['value']
        svr_percent = mortgage['followOnRate']
        apr_percent = mortgage['overallCostAPR']
        ltv_percent = mortgage['LTV']
        # The duration is multiplied by 12 (presumably years -> months).
        initial_period = str(int(initial_rate['duration']) * 12)
        booking_fee = mortgage['arrangementFee']
        mc_util.handle_mortgage_insert(institution_code, mortgage_type, rate_percent, svr_percent, apr_percent, ltv_percent, initial_period, booking_fee, term, url, eligibility, logger)
def halifax_ftb_page(static,url,mortgage_type,eligibility,logger):
    """Scrape a Halifax first-time-buyer rate table and record each product row.

    static        -- passed to themortgagemeter_utils.get_page together with
                     the saved-page path static_html/halifax/fixed.html
    url           -- page to scrape; also stored with each mortgage
    mortgage_type -- mortgage type code stored with each mortgage
    eligibility   -- eligibility code stored with each mortgage
    logger        -- logger to use; the 'retrieve' logger is used when None

    Every <tr> is flattened into its list of stripped text fragments. Rows
    with 20-24 fragments whose fourth fragment contains '%' are treated as
    products; all other rows are ignored.
    """
    # Use the caller's logger; fall back to the module default only when
    # none was supplied.
    if logger is None:
        logger = logging.getLogger('retrieve')
    bsobj = themortgagemeter_utils.get_page(static,'static_html/halifax/fixed.html',url,logger)
    for tr in bsobj.find_all('tr'):
        mortgage_details = [d.encode('utf-8').strip() for d in tr.strings]
        if not 19 < len(mortgage_details) < 25:
            continue
        if mortgage_details[3].find('%') == -1:
            continue
        initial_period = mortgage_details[1]
        if initial_period.startswith('x'):
            # handle special case of "dummy row"
            continue
        # Percentages lose their trailing '%'.
        rate_percent = mortgage_details[3][:-1]
        # Stray UTF-8 non-breaking-space bytes are trimmed from the SVR.
        svr_percent = mortgage_details[6].split()[0][:-1].strip('\xc2').strip('\xa0')
        apr_percent = mortgage_details[10].split()[0][:-1]
        # Skip the two leading bytes (presumably the encoded pound sign) and
        # remove thousands separators.
        booking_fee = mortgage_details[12][2:].replace(',','')
        # handle special nonsense case: "years"/"months" not preceded by
        # "<digits><space>" cannot go through get_months.
        if re.search(r'years',initial_period) and not re.search(r'[0-9]+ years',initial_period):
            initial_period = str(int(initial_period[0]) * 12)
        elif re.search(r'months',initial_period) and not re.search(r'[0-9]+ month',initial_period):
            initial_period = initial_period[0:2]
        else:
            initial_period = str(themortgagemeter_utils.get_months(initial_period,logger))
        # The stored LTV is 100 minus the leading number of this column
        # (presumably a deposit percentage - confirm against the page).
        deposit_parts = mortgage_details[14].split('-')
        if len(deposit_parts) > 1:
            ltv_percent = str(100 - int(deposit_parts[0]))
        else:
            ltv_percent = str(100 - int(mortgage_details[14][0:2]))
        mc_util.handle_mortgage_insert(institution_code,mortgage_type,rate_percent,svr_percent,apr_percent,ltv_percent,initial_period,booking_fee,term,url,eligibility,logger)
def get_product_pages(url, mortgage_type, ltv_percent, eligibilities, logger):
    """Load one Lloyds JSON product feed and record every mortgage in it.

    url           -- address of the JSON feed
    mortgage_type -- mortgage type code stored with each mortgage
    ltv_percent   -- LTV stored with each mortgage
    eligibilities -- iterable of eligibility codes; each mortgage is stored
                     once per code
    logger        -- logger handed to the fetch and insert helpers

    Returns None. A feed whose 'mortgageList' is the string 'none' is logged
    and skipped, as are entries whose 'initial_term' is null.
    """
    resp = themortgagemeter_utils.get_page(False, '', url, logger, tostring=True)
    mortgage_list = json.loads(resp)['mortgageList']
    if mortgage_list == 'none':
        logger.info('URL returned nothing: ' + url)
        return
    for item in mortgage_list:
        # Fields seen in item['mortgages']: mortgage_type, product_fee, ltv,
        # offer, initial_rate, homeowner_variable_rate, repayment, erc,
        # buyer_type, loan_size, overall_cost_for_comparison, payment,
        # initial_term
        mortgage = item['mortgages']
        if mortgage['initial_term'] is None:
            continue
        rate_percent = mortgage['initial_rate']['rate']
        svr_percent = mortgage['homeowner_variable_rate']['rate']
        apr_percent = mortgage['overall_cost_for_comparison']['rate']
        # initial_term may be fractional; it is multiplied by 12 and truncated.
        initial_period = str(int(float(mortgage['initial_term']) * 12))
        booking_fee = mortgage['product_fee']['rate']
        for eligibility in eligibilities:
            mc_util.handle_mortgage_insert(institution_code, mortgage_type, rate_percent, svr_percent, apr_percent, ltv_percent, initial_period, booking_fee, term, 'http://www.lloydsbank.com', eligibility, logger)
def chelsea_main(static,forcedelete,logger):
    """Entry point for the Chelsea scrape: locate the product XML, parse it, refresh current data.

    static      -- passed on to get_product_pages
    forcedelete -- passed on to mc_db.update_current
    logger      -- logger handed to every helper
    """
    # http://www.thechelsea.co.uk/js/mortgage-data-ref.js names the XML file
    # to read, eg
    # http://www.thechelsea.co.uk/mortgages/mortage-product-data-0031.xml
    ref_script = themortgagemeter_utils.get_page(False,'','http://www.thechelsea.co.uk/js/mortgage-data-ref.js',logger,True)
    # The XML path is the text between the first pair of double quotes.
    xml_path = ref_script.split('"')[1]
    get_product_pages(static,'http://www.thechelsea.co.uk/' + xml_path,logger)
    mc_db.update_current(institution_code,main.today,forcedelete,logger)
def get_product_pages(static, url, logger):
    """Read the Tesco product XML and submit every mortgage found in it.

    static -- if true, parse the saved copy at static_html/tesco/Products.xml;
              otherwise fetch the XML from url
    url    -- address of the live product XML (also written to the debug log)
    logger -- logger handed to the fetch, parse and insert helpers

    The document is walked as
    <HousePurchase|Remortgage>/<FixedRate|TrackerRate>/<LTV max=..>/<Mortgage>.
    Each mortgage is passed to mc_util.handle_mortgage_insert once per
    eligibility code of its purchase section. A purchase or rate section that
    is absent from the XML raises AttributeError.
    """
    logger.debug("In get_product_pages: " + url)
    # Only touch the network when not running from the saved copy; the XML is
    # fetched exactly once.
    if static:
        root = ET.parse('static_html/tesco/Products.xml').getroot()
    else:
        root = ET.fromstring(
            themortgagemeter_utils.get_page(False, '', url, logger, True))
    term = str(25 * 12)
    # XML section name -> eligibility codes recorded for its products.
    purchase_sections = (
        ('HousePurchase', ['NFTB', 'NMH']),
        ('Remortgage', ['NRM']),
    )
    # XML section name -> mortgage type code.
    rate_sections = (
        ('FixedRate', 'F'),
        ('TrackerRate', 'T'),
    )
    for purchase, eligibilities in purchase_sections:
        for rate_type, mortgage_type in rate_sections:
            rate_set = root.find(purchase).find(rate_type)
            for rate in rate_set.findall('LTV'):
                ltv_percent = rate.get('max')
                for mortgage in rate.findall('Mortgage'):
                    rate_percent = mortgage.find('initialRate').text
                    apr_percent = mortgage.find('APR').text
                    svr_percent = mortgage.find('variableRate').text
                    # Only the first line of the product name is used to
                    # work out the initial period.
                    name = mortgage.find('name').text.split('\n')[0]
                    initial_period = themortgagemeter_utils.get_months(
                        name, logger)
                    # Booking fee and product fee are reported as one figure.
                    booking_fee = str(
                        int(mortgage.find('bookingFee').text) +
                        int(mortgage.find('productFee').text))
                    for eligibility in eligibilities:
                        mc_util.handle_mortgage_insert(
                            institution_code, mortgage_type, rate_percent,
                            svr_percent, apr_percent, ltv_percent,
                            initial_period, booking_fee, term,
                            'http://www.tescobank.com/personal/finance/mortgages',
                            eligibility, logger)
def get_product_pages(static,base_url,suffix_url,logger):
    """Crawl one listing page and visit every '/products/' link not seen before.

    static     -- passed through to themortgagemeter_utils.get_page
    base_url   -- site root
    suffix_url -- path of the listing page, appended to base_url
    logger     -- logger used for progress messages

    The page's suffix is appended to the module-level pages_so_far list;
    links already in that list are skipped. Every other matching link is
    handed to get_mortgage_page_details.
    """
    page_url = base_url + suffix_url
    logger.info("In get_product_pages: " + page_url)
    bsobj = themortgagemeter_utils.get_page(static,'NA',page_url,logger)
    pages_so_far.append(suffix_url)
    for anchor in bsobj.find_all('a'):
        href = anchor.get('href')
        if not href or not re.match('.*/products/.*',href):
            continue
        logger.info("HREF:" + href)
        # Already done this page?
        if href not in pages_so_far:
            get_mortgage_page_details(static,base_url,href,logger)
def process_page(url,ltv,eligibility,mortgage_type,logger):
    """Load one Nationwide JSON rate feed and record every rate in it.

    url           -- address of the JSON feed
    ltv           -- not used by this function
    eligibility   -- eligibility code stored with each mortgage
    mortgage_type -- mortgage type code stored with each mortgage
    logger        -- logger handed to the fetch and insert helpers

    Returns None. A feed whose 'Rates' value is the string 'none' is logged
    and skipped.
    """
    # Example of one entry of the feed's "Rates" list:
    # {
    #   "Apr": 3.7,
    #   "AssetUrl": "58305",
    #   "Availability": "All",
    #   "BaseRateDifferential": 0,
    #   "BookingFee": 99,
    #   "CashBackAmount": 0,
    #   "Changed": "Changed",
    #   "Eligibility": "First Time Buyer",
    #   "EligibilityFeatures": null,
    #   "ErcPercentage": 3,
    #   "FeesPayable": 499,
    #   "HasFreeLegals": false,
    #   "HasFreeValuations": false,
    #   "InitialRate": 1.94,
    #   "MaxLoanAmount": 1000000,
    #   "MaxLoanToValue": 60,
    #   "MinLoanAmount": 25000,
    #   "MinLoanToValue": 0,
    #   "MonthlyRepayment": 420.93,
    #   "MortgageType": 0,
    #   "OverpaymentAmountAllowed": 0,
    #   "ProductDescription": "2 Year Fixed Rate (First Time Buyer)",
    #   "ProductFee": 400,
    #   "RequiresExistingBorrower": false,
    #   "RequiresFirstTimeBuyer": true,
    #   "RequiresFlexAccount": false,
    #   "RequiresFurtherAdvance": false,
    #   "RequiresHomeMover": true,
    #   "RequiresOwnSolicitor": false,
    #   "RequiresRemortgage": false,
    #   "ReservationFeeScale": "",
    #   "RevertRate1": 3.99,
    #   "SpecialProductFee": 900,
    #   "Term": 24,
    #   "Withdrawn": false
    # }
    feed_text = themortgagemeter_utils.get_page(False,'',url,logger,tostring=True)
    mortgage_list = json.loads(feed_text)['Rates']
    if mortgage_list == 'none':
        logger.info('URL returned nothing: ' + url)
        return
    for mortgage in mortgage_list:
        # The stored fee is the sum of three fee fields, truncated to an int.
        fee_total = mortgage['BookingFee'] + mortgage['SpecialProductFee'] + mortgage['FeesPayable']
        mc_util.handle_mortgage_insert(
            institution_code,
            mortgage_type,
            str(mortgage['InitialRate']),
            str(mortgage['RevertRate1']),
            str(mortgage['Apr']),
            str(mortgage['MaxLoanToValue']),
            str(mortgage['Term']),
            str(int(fee_total)),
            term,
            'http://www.nationwide.co.uk',
            eligibility,
            logger)
def get_product_pages(static, base_url, suffix_url, logger):
    """Crawl one listing page and visit every '/products/' link not seen before.

    static     -- passed through to themortgagemeter_utils.get_page
    base_url   -- site root
    suffix_url -- path of the listing page, appended to base_url
    logger     -- logger used for progress messages

    The page's suffix is appended to the module-level pages_so_far list;
    links already in that list are skipped. Every other matching link is
    handed to get_mortgage_page_details.
    """
    page_url = base_url + suffix_url
    logger.info("In get_product_pages: " + page_url)
    bsobj = themortgagemeter_utils.get_page(static, 'NA', page_url, logger)
    pages_so_far.append(suffix_url)
    for anchor in bsobj.find_all('a'):
        href = anchor.get('href')
        if not href or not re.match('.*/products/.*', href):
            continue
        logger.info("HREF:" + href)
        # Already done this page?
        if href not in pages_so_far:
            get_mortgage_page_details(static, base_url, href, logger)
def get_product_page(static,url):
    """Scrape the First Direct mortgage rate tables and record each product row.

    static -- passed to themortgagemeter_utils.get_page together with the
              saved-page path static_html/first_direct/mortgage-rates
    url    -- page to scrape; also stored with each mortgage

    Each <tr> inside a <tbody> of an element with class 'section' is read by
    column position: 0 product description (initial period and mortgage
    type), 1 LTV, 2 rate, 3 SVR, 4 APR, 5 and 6 fees (summed into the booking
    fee). Rows with fewer than five cells are skipped.
    """
    logger = logging.getLogger('retrieve')
    bsobj = themortgagemeter_utils.get_page(static,'static_html/first_direct/mortgage-rates',url,logger)
    # Diagnostic dumps go to the logger rather than stdout.
    logger.debug(bsobj)
    # assume default of 25 years
    term = str(25 * 12)
    for section in bsobj.find_all(attrs={'class':'section'}):
        for tbody in section.find_all("tbody"):
            logger.debug(tbody)
            for tr in tbody.find_all("tr"):
                cells = [td.text.strip().encode('utf-8') for td in tr.find_all("td")]
                if len(cells) < 5:
                    # Not a full product row; inserting it would reuse values
                    # left over from an earlier row.
                    logger.debug('Skipping row with %d cells', len(cells))
                    continue
                logger.debug(cells[0])
                # initial_period and mortgage_type (F/D/T/O/V) both come from
                # the description in the first column.
                initial_period = themortgagemeter_utils.get_months(cells[0],logger)
                mortgage_type = mc_util.get_mortgage_type(cells[0],logger)
                ltv_percent = themortgagemeter_utils.get_percentage(cells[1],logger)
                rate_percent = themortgagemeter_utils.get_percentage(cells[2],logger)
                svr_percent = themortgagemeter_utils.get_percentage(cells[3],logger)
                apr_percent = themortgagemeter_utils.get_percentage(cells[4],logger)
                # Columns 5 and 6, when present, are added together.
                booking_fee_int = 0
                for fee_text in cells[5:7]:
                    booking_fee_int = booking_fee_int + int(themortgagemeter_utils.get_money(fee_text,logger))
                booking_fee = str(booking_fee_int)
                # NOTE(review): 'eligibility' is not assigned anywhere in this
                # function; presumably a module-level value - confirm, as this
                # raises NameError otherwise.
                mc_util.handle_mortgage_insert(institution_code,mortgage_type,rate_percent,svr_percent,apr_percent,ltv_percent,initial_period,booking_fee,term,url,eligibility,logger)
def _halifax_initial_period(raw_period,logger):
    """Turn the text of a Halifax 'Term' cell into the initial period string."""
    # handle special nonsense case: "years"/"months" not preceded by
    # "<digits><space>" cannot go through get_months.
    if re.search(r'years',raw_period) and not re.search(r'[0-9]+ years',raw_period):
        return str(int(raw_period[0]) * 12)
    if re.search(r'months',raw_period) and not re.search(r'[0-9]+ month',raw_period):
        return raw_period[0:2]
    return str(themortgagemeter_utils.get_months(raw_period,logger))


def halifax_remortgage_page(static,url,mortgage_type,eligibility,logger):
    """Scrape a Halifax remortgage rate table and record each product row.

    static        -- passed to themortgagemeter_utils.get_page together with
                     the saved-page path
                     static_html/halifax/remortgage-fixed-75ltv.asp
    url           -- page to scrape; also stored with each mortgage
    mortgage_type -- mortgage type code stored with each mortgage
    eligibility   -- eligibility code stored with each mortgage
    logger        -- logger used for diagnostics and handed to the helpers

    Every <tr> is flattened into its list of stripped text fragments. Rows
    come in two layouts that differ only in where the SVR, APR, fee and LTV
    fragments sit: 20-24 fragments, or exactly 25 fragments. Rows whose
    fourth fragment has no '%' are not products and are skipped.
    """
    bsobj = themortgagemeter_utils.get_page(static,'static_html/halifax/remortgage-fixed-75ltv.asp',url,logger)
    for tr in bsobj.find_all('tr'):
        mortgage_details = [d.encode('utf-8').strip() for d in tr.strings]
        # Example header row:
        #['\n', 'Term', 'Initial rate', '\xc2\xa0', 'Halifax Homeowner Variable rate thereafter', '\xc2\xa0', 'For the remainder of the term from', '\xc2\xa0', 'The overall cost for comparison is', '\xc2\xa0', 'Product fee', '\xc2\xa0', 'LTV\xc2\xa0\xc2\xa0\xc2\xa0\xc2\xa0\xc2\xa0\xc2\xa0 ', 'Early Repayment Charges until', '\xc2\xa0', 'Loan amount', '\n', 'Extra benefits', '\xc2\xa0', '\n', '\xc2\xa0', '\n']
        # Example product row:
        #['\n', '2 years', '\n', '4.44%', '\n', 'Currently', ' \xc2\xa03.99%', '\n', '30/11/2014', '\n', '4.3% APR', '\n', '\xc2\xa3995', '\n', '75-80%', '\n', '30/11/2014', '\n', '\xc2\xa30-\xc2\xa31m', '\n', 'Halifax Remortgage Service*', '\n', '\n']
        logger.debug(mortgage_details)
        fragment_count = len(mortgage_details)
        if 19 < fragment_count < 25:
            svr_at, apr_at, fee_at, ltv_at = 6, 10, 12, 14
        elif fragment_count == 25:
            svr_at, apr_at, fee_at, ltv_at = 8, 12, 14, 16
        else:
            if fragment_count > 3:
                logger.debug('Should this be handled?: %s',mortgage_details)
            continue
        if mortgage_details[3].find('%') == -1:
            continue
        # Percentages lose their trailing '%'.
        rate_percent = mortgage_details[3][:-1]
        # Stray UTF-8 non-breaking-space bytes are trimmed from the SVR.
        svr_percent = mortgage_details[svr_at].split()[0][:-1].strip('\xc2').strip('\xa0')
        apr_percent = mortgage_details[apr_at].split()[0][:-1]
        # Skip the two leading bytes (the encoded pound sign in the example
        # row above) and remove thousands separators.
        booking_fee = mortgage_details[fee_at][2:].replace(',','')
        initial_period = _halifax_initial_period(mortgage_details[1],logger)
        # "75-80%" -> "80"; a cell without a range is used as it stands
        # instead of failing with IndexError.
        ltv_parts = mortgage_details[ltv_at].split('-')
        if len(ltv_parts) > 1:
            ltv_percent = ltv_parts[1].strip('%')
        else:
            ltv_percent = ltv_parts[0].strip('%')
        mc_util.handle_mortgage_insert(institution_code,mortgage_type,rate_percent,svr_percent,apr_percent,ltv_percent,initial_period,booking_fee,term,url,eligibility,logger)
def halifax_ftb_page(static, url, mortgage_type, eligibility, logger):
    """Scrape a Halifax first-time-buyer rate table and record each product row.

    static        -- passed to themortgagemeter_utils.get_page together with
                     the saved-page path static_html/halifax/fixed.html
    url           -- page to scrape; also stored with each mortgage
    mortgage_type -- mortgage type code stored with each mortgage
    eligibility   -- eligibility code stored with each mortgage
    logger        -- logger to use; the 'retrieve' logger is used when None

    Every <tr> is flattened into its list of stripped text fragments. Rows
    with 20-24 fragments whose fourth fragment contains '%' are treated as
    products; all other rows are ignored.
    """
    # Use the caller's logger; fall back to the module default only when
    # none was supplied.
    if logger is None:
        logger = logging.getLogger('retrieve')
    bsobj = themortgagemeter_utils.get_page(
        static, 'static_html/halifax/fixed.html', url, logger)
    for tr in bsobj.find_all('tr'):
        mortgage_details = [d.encode('utf-8').strip() for d in tr.strings]
        if not 19 < len(mortgage_details) < 25:
            continue
        if mortgage_details[3].find('%') == -1:
            continue
        initial_period = mortgage_details[1]
        if initial_period.startswith('x'):
            # handle special case of "dummy row"
            continue
        # Percentages lose their trailing '%'.
        rate_percent = mortgage_details[3][:-1]
        # Stray UTF-8 non-breaking-space bytes are trimmed from the SVR.
        svr_percent = mortgage_details[6].split()[0][:-1].strip(
            '\xc2').strip('\xa0')
        apr_percent = mortgage_details[10].split()[0][:-1]
        # Skip the two leading bytes (presumably the encoded pound sign) and
        # remove thousands separators.
        booking_fee = mortgage_details[12][2:].replace(',', '')
        # handle special nonsense case: "years"/"months" not preceded by
        # "<digits><space>" cannot go through get_months.
        if re.search(r'years', initial_period) and not re.search(
                r'[0-9]+ years', initial_period):
            initial_period = str(int(initial_period[0]) * 12)
        elif re.search(r'months', initial_period) and not re.search(
                r'[0-9]+ month', initial_period):
            initial_period = initial_period[0:2]
        else:
            initial_period = str(
                themortgagemeter_utils.get_months(initial_period, logger))
        # The stored LTV is 100 minus the leading number of this column
        # (presumably a deposit percentage - confirm against the page).
        deposit_parts = mortgage_details[14].split('-')
        if len(deposit_parts) > 1:
            ltv_percent = str(100 - int(deposit_parts[0]))
        else:
            ltv_percent = str(100 - int(mortgage_details[14][0:2]))
        mc_util.handle_mortgage_insert(institution_code, mortgage_type,
                                       rate_percent, svr_percent,
                                       apr_percent, ltv_percent,
                                       initial_period, booking_fee, term,
                                       url, eligibility, logger)
def get_product_pages(static,base_url,ext):
    """Walk the HSBC savings index page and process every savings-account link.

    static   -- passed to themortgagemeter_utils.get_page together with the
                saved-page path static_html/hsbc/savings-accounts.html
    base_url -- site root; prefixed to ext and to each link found
    ext      -- path of the index page

    For each matching link a fresh savings data object is created, marked as
    available online and in branch, and handed first to
    get_product_page_interest_rates and then to get_product_page_details.
    """
    logger = logging.getLogger('retrieve')
    bsobj = themortgagemeter_utils.get_page(static,'static_html/hsbc/savings-accounts.html',base_url + ext,logger)
    # Product links live in the anchors of each 'doormatCol' element.
    for column in bsobj.find_all(attrs={'class' : 'doormatCol'}):
        for link in column.find_all('a'):
            href = link.get('href')
            if not (href and re.match('.*savings-accounts/.*',href)):
                continue
            url = base_url + href
            savings_data = savings_util.get_savings_data_object()
            # Set online and branch to default to Y for HSBC
            savings_data['online'] = 'Y'
            savings_data['branch'] = 'Y'
            get_product_page_interest_rates(url + '/interest-rates',savings_data)
            get_product_page_details(url + '/details',savings_data)
def get_product_pages(static, base_url, ext):
    """Walk the HSBC savings index page and process every savings-account link.

    static   -- passed to themortgagemeter_utils.get_page together with the
                saved-page path static_html/hsbc/savings-accounts.html
    base_url -- site root; prefixed to ext and to each link found
    ext      -- path of the index page

    For each matching link a fresh savings data object is created, marked as
    available online and in branch, and handed first to
    get_product_page_interest_rates and then to get_product_page_details.
    """
    logger = logging.getLogger('retrieve')
    bsobj = themortgagemeter_utils.get_page(
        static, 'static_html/hsbc/savings-accounts.html', base_url + ext,
        logger)
    # Product links live in the anchors of each 'doormatCol' element.
    for column in bsobj.find_all(attrs={'class': 'doormatCol'}):
        for link in column.find_all('a'):
            href = link.get('href')
            if not (href and re.match('.*savings-accounts/.*', href)):
                continue
            url = base_url + href
            savings_data = savings_util.get_savings_data_object()
            # Set online and branch to default to Y for HSBC
            savings_data['online'] = 'Y'
            savings_data['branch'] = 'Y'
            get_product_page_interest_rates(url + '/interest-rates',
                                            savings_data)
            get_product_page_details(url + '/details', savings_data)
def get_product_pages(url,mortgage_type,ltv_percent,eligibilities,logger):
    """Load one Lloyds JSON product feed and record every mortgage in it.

    url           -- address of the JSON feed
    mortgage_type -- mortgage type code stored with each mortgage
    ltv_percent   -- LTV stored with each mortgage
    eligibilities -- iterable of eligibility codes; each mortgage is stored
                     once per code
    logger        -- logger handed to the fetch and insert helpers

    Returns None. A feed whose 'mortgageList' is the string 'none' is logged
    and skipped, as are entries whose 'initial_term' is null.
    """
    resp = themortgagemeter_utils.get_page(False,'',url,logger,tostring=True)
    mortgage_list = json.loads(resp)['mortgageList']
    if mortgage_list == 'none':
        logger.info('URL returned nothing: ' + url)
        return
    for item in mortgage_list:
        # Fields seen in item['mortgages']: mortgage_type, product_fee, ltv,
        # offer, initial_rate, homeowner_variable_rate, repayment, erc,
        # buyer_type, loan_size, overall_cost_for_comparison, payment,
        # initial_term
        mortgage = item['mortgages']
        if mortgage['initial_term'] is None:
            continue
        rate_percent = mortgage['initial_rate']['rate']
        svr_percent = mortgage['homeowner_variable_rate']['rate']
        apr_percent = mortgage['overall_cost_for_comparison']['rate']
        # initial_term may be fractional; it is multiplied by 12 and truncated.
        initial_period = str(int(float(mortgage['initial_term']) * 12))
        booking_fee = mortgage['product_fee']['rate']
        for eligibility in eligibilities:
            mc_util.handle_mortgage_insert(institution_code,mortgage_type,rate_percent,svr_percent,apr_percent,ltv_percent,initial_period,booking_fee,term,'http://www.lloydsbank.com',eligibility,logger)
def get_product_page_interest_rates(url,savings_data):
    """Scrape the interest-rate tables of one HSBC savings product page.

    url          -- address of the product's interest-rates page
    savings_data -- dict-like object of product attributes; it is MODIFIED in
                    place as table summaries are recognised, and a copy of its
                    current state is taken for every table row

    Each <table> is classified by its 'summary' attribute, which sets
    product-level flags on savings_data. Every row after the first is then
    read cell by cell and passed to savings_util.handle_savings_insert.
    An unrecognised summary, an unparseable regular-saver amount or an empty
    summary records an alert (first two cases) and calls exit().
    """
    logger = logging.getLogger('retrieve ' + url)
    bsobj = themortgagemeter_utils.get_page(False,'',url,logger)
    if re.match('.*isa.*',url):
        savings_data['isa'] = 'Y'
    for t in bsobj.find_all('table'):
        # Get all tables, then match on summary == "Interest rates:.*", and set up variables accordingly.
        # NOTE(review): a table without a summary attribute makes t.get()
        # return None, so .encode raises AttributeError before the check below.
        summary = t.get('summary').encode('utf-8').lower()
        if summary:
            # Set up data for this page
            # NOTE(review): a summary that lacks "interest rates: " makes
            # re.match return None and .group raise AttributeError.
            summary_info = re.match('.*interest rates: (.*)',summary).group(1)
            # NOTE(review): ("...") is a parenthesised string, not a tuple, so
            # the three 'in' tests below are substring tests (true for any
            # fragment of the literal, including ''). Equality was presumably
            # intended - confirm against live summaries before changing.
            if summary_info in ("cash e-isa#"):
                savings_data['isa'] = 'Y'
            elif summary_info in ("fixed rate saver - monthly interest"):
                savings_data['variability'] = 'F'
                savings_data['interest_paid'] = 'M'
            elif summary_info in ("fixed rate saver - annual interest"):
                savings_data['variability'] = 'F'
                savings_data['interest_paid'] = 'Y'
            elif "regular saver" in summary_info:
                savings_data['regular_saver'] = 'Y'
                savings_data['interest_paid'] = 'Y'
            elif "online bonus" in summary_info:
                savings_data['bonus'] = 'Y'
                savings_data['branch'] = 'N'
                savings_data['bonus_frequency_period'] = '1'
                savings_data['bonus_frequency_type'] = 'M'
                # skip bonus for HSBC- it's complicated - probably needs its own function TODO
                # The flags set above stay on savings_data for later tables.
                continue
            elif "flexible saver" in summary_info:
                savings_data['variability'] = 'V'
            else:
                themortgagemeter_utils.record_alert('NEED TO HANDLE: ' + summary_info,logger,themortgagemeter_db.db_connection,themortgagemeter_db.cursor)
                exit()
            tr_count = 0
            for tr in t.find_all('tr'):
                # This is a new savings product, so clone the data at this point and use that from here.
                this_savings_data = savings_data.copy()
                if this_savings_data['bonus'] == 'Y':
                    # Placeholder: bonus rows get no special handling yet.
                    pass
                # Regular-saver rows start counting at -1, so the first cell
                # that reaches the loop body is not treated as a data column.
                if this_savings_data['regular_saver'] == 'Y':
                    td_count = -1
                else:
                    td_count = 0
                # The first row of each table is never parsed.
                if tr_count >= 1:
                    # If tax-free, this will be true
                    for td in tr.find_all('td'):
                        td_style = td.get('style')
                        if td_style != None:
                            # Python 2 str.translate(None, chars) deletes chars:
                            # spaces are removed before the comparison.
                            td_style = td_style.lower().encode('utf-8').translate(None, ' ')
                            if td_style == 'vertical-align:middle':
                                # Such cells are skipped without advancing td_count.
                                continue
                        logger.info(td)
                        v = td.text.encode('utf-8').lower().strip()
                        if td_count == 0:
                            if this_savings_data['regular_saver'] == 'Y':
                                logger.info('regular_saver: ' + v)
                                # Words 0 and 2 are the min and max amounts with
                                # their first two bytes dropped; word 4 must be
                                # "month".
                                this_savings_data['regular_saver_min_amt'] = v.split()[0][2:]
                                this_savings_data['regular_saver_max_amt'] = v.split()[2][2:]
                                if v.split()[4] == "month":
                                    this_savings_data['regular_saver_frequency_period'] = '1'
                                    this_savings_data['regular_saver_frequency_type'] = 'M'
                                else:
                                    themortgagemeter_utils.record_alert('ERROR: reg saver not parsed: ' + v,logger,themortgagemeter_db.db_connection,themortgagemeter_db.cursor)
                                    exit()
                            else:
                                # if it's got a + at the end, it's a min, if it's "up to" it's a max.
                                res = savings_util.get_money_range(v,logger)
                                this_savings_data['min_amt'] = res[0]
                                this_savings_data['max_amt'] = res[1]
                        elif td_count == 1:
                            # we don't bother with net_percent
                            pass
                        elif td_count == 2:
                            # gross %
                            this_savings_data['gross_percent'] = v
                        elif td_count == 3:
                            this_savings_data['aer_percent'] = v
                        td_count += 1
                    # Some trs have no tds; we ignore those.
                    if td_count > 0:
                        # Now store this product
                        # TODO: fixed savings?
                        logger.info(this_savings_data)
                        isa = this_savings_data['isa']
                        regular_saver = this_savings_data['regular_saver']
                        regular_saver_frequency_period = this_savings_data['regular_saver_frequency_period']
                        regular_saver_frequency_type = this_savings_data['regular_saver_frequency_type']
                        regular_saver_min_amt = this_savings_data['regular_saver_min_amt']
                        regular_saver_max_amt = this_savings_data['regular_saver_max_amt']
                        bonus = this_savings_data['bonus']
                        bonus_frequency_period = this_savings_data['bonus_frequency_period']
                        bonus_frequency_type = this_savings_data['bonus_frequency_type']
                        online = this_savings_data['online']
                        branch = this_savings_data['branch']
                        variability = this_savings_data['variability']
                        min_amt = this_savings_data['min_amt']
                        max_amt = this_savings_data['max_amt']
                        gross_percent = this_savings_data['gross_percent']
                        aer_percent = this_savings_data['aer_percent']
                        interest_paid = this_savings_data['interest_paid']
                        child = this_savings_data['child']
                        savings_period = this_savings_data['savings_period']
                        savings_util.handle_savings_insert(institution_code, isa, regular_saver, regular_saver_frequency_period, regular_saver_frequency_type, regular_saver_min_amt, regular_saver_max_amt, bonus, bonus_frequency_period, bonus_frequency_type, online, branch, variability, savings_period, min_amt, max_amt, gross_percent, aer_percent, child, interest_paid, url, logger)
                    else:
                        tr_count += 1
                        continue
                tr_count += 1
        else:
            # An empty summary stops the whole run.
            exit()
def get_product_page_details(url,savings_data):
    """Fetch the details page of one savings product.

    url          -- address of the product's details page
    savings_data -- not used yet

    Only the fetch is implemented; the page returned by get_page is discarded.
    """
    themortgagemeter_utils.get_page(False,'',url,logging.getLogger('retrieve'))
def get_product_pages(static,url,logger):
    """Read the Chelsea mortgage product XML and record every product.

    The standard variable rate (SVR) is not part of the product data, so it is
    read first from the ``chelseaSVR`` variable in the site's
    mortgage-finder.js.  Each ``product`` element is then turned into one
    ``mc_util.handle_mortgage_insert`` call per eligibility.

    Args:
        static: when true, parse the product XML from
            static_html/chelsea/mortage-product-data-0031.xml instead of
            fetching ``url``.  The SVR script is fetched from the network in
            both cases.
        url: address of the product XML; also stored with each product.
        logger: logger passed to every helper.

    Raises:
        Exception: if no ``chelseaSVR`` line is found in the script
            (previously this surfaced later as a NameError).
    """
    logger.debug("In get_product_pages: " + url)
    # Get the svr first (it's global).  The script holds a line of the form
    #   var chelseaSVR = "<number>%" ...
    svr_re = re.compile(r'^var chelseaSVR = "([^%]*)%".*$')
    js_text = themortgagemeter_utils.get_page(False,'','http://www.thechelsea.co.uk/js/mortgage-finder.js',logger,True)
    svr_percent = None
    for line in js_text.split('\n'):
        svr_match = svr_re.match(line)
        if svr_match is not None:
            svr_percent = svr_match.group(1)
            break
    if svr_percent is None:
        raise Exception('Could not find chelseaSVR in mortgage-finder.js')

    # Now get the mortgage data
    if static:
        root = ET.parse('static_html/chelsea/mortage-product-data-0031.xml').getroot()
    else:
        root = ET.fromstring(themortgagemeter_utils.get_page(False,'',url,logger,True))
    term = str(25 * 12)

    # Raw mortgageType attribute -> mortgage type code.  Anything not listed
    # defaults to variable ('V').
    mortgage_type_codes = {
        'fixed': 'F',
        'fixedoffset': 'F',
        'ftbfixed': 'F',
        'ftbfixedoffset': 'F',
        # Presumably fixed, then a tracker??
        'fixedtracker': 'F',
        'tracker': 'T',
        'trackeroffset': 'T',
        'offset': 'T',
        'mixedoffset': 'T',
        # rollover? no example, but exists in the docs
        'rollover': 'T',
        # Meaning of "mixed" is unclear; treated as a tracker.
        'mixed': 'T',
    }
    # (XML attribute, eligibility dictionary key) pairs; an attribute value of
    # 'Y' marks the key with 'B'.
    # NOTE(review): newBorrower maps to 'moving_home', same as movingHome --
    # kept as in the original, confirm this is intended.
    eligibility_flags = (
        ('existingBorrower', 'existing_customer'),
        ('newBorrower', 'moving_home'),
        ('firstTimeBuyer', 'ftb'),
        ('movingHome', 'moving_home'),
        ('remortgaging', 'remortgage'),
    )

    for product in root.findall('product'):
        apr_percent = product.get('apr').split('%')[0]
        rate_percent = product.get('interestRate').split('%')[0]
        ltv_percent = product.get('maxLTV').split('%')[0]
        name = product.get('name')
        booking_fee = product.get('completionFee')
        if booking_fee == '':
            booking_fee = '0'
        mortgage_type = mortgage_type_codes.get(product.get('mortgageType'), 'V')

        # Get a mortgage eligibility dictionary to submit.
        mortgage_eligibility_dict = mc_util.get_mortgage_eligibility_dict()
        for attribute, key in eligibility_flags:
            if product.get(attribute) == 'Y':
                mortgage_eligibility_dict[key] = 'B'
        eligibilities = mc_util.validate_eligibility_dict(mortgage_eligibility_dict,[])

        # use get_months to determine period from the product name
        initial_period = themortgagemeter_utils.get_months(name,logger)
        for eligibility in eligibilities:
            mc_util.handle_mortgage_insert(institution_code,mortgage_type,rate_percent,svr_percent,apr_percent,ltv_percent,initial_period,booking_fee,term,url,eligibility,logger)
def _skipton_fee_text(td):
    """Return a fee cell's text minus its first two bytes and any commas."""
    # The first two bytes are presumably the UTF-8 encoded pound sign; for a
    # "No Fee" cell this leaves " Fee", which the caller maps to "0".
    return td.string.encode('utf_8')[2:].replace(',','')


def _skipton_read_rate_table(table, rate_td_index, logger):
    """Read rate, SVR, APR, application fee and booking fee from a product table.

    Values are taken from fixed row positions (1-based rows 3, 4, 5, 7 and 8).
    Rows that are absent leave the defaults: 0 for the percentages and '0'
    for the fees.
    """
    rate_percent = 0
    svr_percent = 0
    apr_percent = 0
    # Fees default to strings because the caller strips and int()s them.
    application_fee = '0'
    booking_fee = '0'
    for tr_count, tr in enumerate(table.find_all('tr'), 1):
        if tr_count not in (3, 4, 5, 7, 8):
            continue
        tds = tr.find_all('td')
        if tr_count == 3:
            rate_percent = themortgagemeter_utils.get_percentage(tds[rate_td_index].string,logger)
        elif tr_count == 4:
            svr_percent = themortgagemeter_utils.get_percentage(tds[1].string,logger)
        elif tr_count == 5:
            apr_percent = themortgagemeter_utils.get_percentage(tds[0].string,logger)
        elif tr_count == 7:
            application_fee = _skipton_fee_text(tds[0])
        elif tr_count == 8:
            booking_fee = _skipton_fee_text(tds[0])
    return rate_percent, svr_percent, apr_percent, application_fee, booking_fee


def get_product_pages(static,base_url,suffix,mortgage_type,href_re):
    """Follow the product links on a Skipton listing page and record each one.

    Every anchor whose href matches ``href_re`` is classified by the
    module-level patterns ``fr_re``, ``tracker_re`` and ``discount_re``; the
    match supplies the initial period (group 1) and the LTV (group 2).  The
    linked page's first table inside ``#centralContent`` supplies the rates
    and fees.  One ``mc_util.handle_mortgage_insert`` call is made per
    eligibility.

    Args:
        static: passed through to ``themortgagemeter_utils.get_page``.
        base_url: site prefix; prepended to ``suffix`` and to every link.
        suffix: path of the listing page.
        mortgage_type: mortgage type code recorded for every product found.
        href_re: pattern selecting which anchors to follow.

    Raises:
        Exception: for a link matching none of the known patterns.
    """
    logger = logging.getLogger('retrieve')
    bsobj = themortgagemeter_utils.get_page(static,'static_html/skipton/fixed_rate_mortgages.html',base_url + suffix,logger)
    term = str(25 * 12)
    # eligibilities - first time buyers have own page, so all others?
    eligibilities = ['NMH','NRM','ERM','EMH','EBM','EED']
    # (link pattern, multiplier for the captured period, td index of the
    # initial rate in table row 3).
    # NOTE(review): tracker and discount links multiply by 10 where fixed-rate
    # links multiply by 12 -- kept as in the original, confirm it is intended.
    link_kinds = (
        (fr_re, 12, 1),
        (tracker_re, 10, 0),
        (discount_re, 10, 1),
    )
    for anchor in bsobj.find_all(href=href_re):
        # Get from the anchor the ltv and the term
        link = anchor.get('href')
        url = base_url + link

        link_match = None
        for pattern, period_factor, rate_td_index in link_kinds:
            link_match = re.search(pattern,link)
            if link_match:
                break
        if not link_match:
            if re.search(ftb_re,link):
                themortgagemeter_utils.record_alert('ERROR: SKIPTON first time buyer seen for the first time',logger,themortgagemeter_db.db_connection,themortgagemeter_db.cursor)
                continue
            raise Exception("Unhandled link " + url,'')

        initial_period = str(int(link_match.group(1)) * period_factor)
        ltv_percent = str(int(link_match.group(2)))
        # Now go to link
        subpage_bsobj = themortgagemeter_utils.get_page(static,'N/A',url,logger)
        table = subpage_bsobj.find_all(attrs={'id' : 'centralContent'},limit=1)[0].find_all('table',limit=1)[0]
        (rate_percent, svr_percent, apr_percent,
         application_fee, booking_fee) = _skipton_read_rate_table(table, rate_td_index, logger)

        # set up the booking fee
        # Sometimes it's "No Fee" on the page
        if booking_fee.strip() == "Fee":
            booking_fee = "0"
        if application_fee.strip() == "Fee":
            application_fee = "0"
        booking_fee = str(int(booking_fee) + int(application_fee))
        for eligibility in eligibilities:
            mc_util.handle_mortgage_insert(institution_code,mortgage_type,rate_percent,svr_percent,apr_percent,ltv_percent,initial_period,booking_fee,term,url,eligibility,logger)
def process_page(url, logger):
    """Fetch the Nationwide mortgage list at ``url`` and record every product.

    The response is a JavaScript assignment rather than strict JSON, e.g.::

        var mortgages = [
        {
        data: "65543",
        name: "2 year fixed",
        offer: " ",
        customer: "Existing Customer",
        type: "Fixed Rate",
        lowltv: 0,
        highltv: 60,
        initialrate: 2.19,
        until: 2,
        rateafter: 4.95,
        apr: 4.6,
        fee: 499,
        minloan: 1,
        maxloan: 1000000,
        links: "/personal/mortgages/all-our-mortgages/..."},{
        [...]
        ]

    so the property names are quoted with regular expressions before the
    text is handed to ``json.loads``.  Each entry produces one
    ``mc_util.handle_mortgage_insert`` call per eligibility implied by its
    ``customer`` field.

    Args:
        url: address of the script to fetch.
        logger: logger passed to every helper.

    Raises:
        Exception: if an entry's ``customer`` value is not recognised.
    """
    resp = themortgagemeter_utils.get_page(False, '', url, logger, tostring=True)
    # Tidy up json
    # http://stackoverflow.com/questions/4033633/handling-lazy-json-in-python-expecting-property-name
    resp = re.sub(r"{\s*'?(\w)", r'{"\1', resp)
    resp = re.sub(r",\s*'?(\w)", r',"\1', resp)
    resp = re.sub(r"(\w)'?\s*:", r'\1":', resp)
    resp = re.sub(r":\s*'(\w+)'\s*([,}])", r':"\1"\2', resp)
    # Skip the first 16 characters: the length of the "var mortgages = "
    # prefix shown in the sample above.
    json_obj = json.loads(resp[16:])
    logger.debug(json_obj)

    customer_eligibilities = {
        "Existing Customer": ("EMH", "EBM", "EDE", "EED"),
        "First Time Buyer": ("NFTB", ),
        "New Customer": ("NRM", "NMH"),
    }
    for mortgage in json_obj:
        customer = mortgage['customer']
        if customer not in customer_eligibilities:
            # The original message referenced names that were not defined at
            # this point; report the offending customer value instead.
            raise Exception('Unrecognised eligibility: ' + customer, customer)
        eligibilities = customer_eligibilities[customer]
        mortgage_type = mc_util.get_mortgage_type(mortgage['name'], logger)
        rate_percent = str(mortgage['initialrate'])
        svr_percent = str(mortgage['rateafter'])
        apr_percent = str(mortgage['apr'])
        # 'until' is multiplied by 12 (years -> months, going by the sample
        # names such as "2 year fixed" with until: 2).
        initial_period = str(int(mortgage['until'] * 12.0))
        booking_fee = str(mortgage['fee'])
        ltv_percent = str(mortgage['highltv'])
        for eligibility in eligibilities:
            logger.debug(eligibility)
            mc_util.handle_mortgage_insert(institution_code, mortgage_type, rate_percent, svr_percent, apr_percent, ltv_percent, initial_period, booking_fee, term, 'http://www.nationwide.co.uk', eligibility, logger)
def process_more_info_page(savings_data,url,logger):
    """Scrape one Halifax savings detail page into a list of savings records.

    The page at ``url`` is fetched and, depending on ``savings_data['isa']``
    and on which product the URL names, one of several layouts is parsed.
    Every rate found becomes a copy of ``savings_data`` with the rate (and,
    where available, period and balance range) filled in.

    Args:
        savings_data: dictionary of values already known for the product.
            Note the online/regular/everyday saver branches write to this
            dictionary itself before copying it.
        url: address of the detail page.
        logger: logger passed to every helper.

    Returns:
        The list of record dictionaries.  It is returned empty only for
        branch-account URLs; in every other case an empty result records an
        alert and exits.
    """
    bsobj = themortgagemeter_utils.get_page(False,'static_html/halifax/savings-accounts.html',url,logger)
    #print bsobj
    savings_array = []
    #print "Passed in:"
    #print savings_data
    print url
    if savings_data['isa'] == 'Y':
        # ISA pages: rates live in the table(s) under the "Summary box" heading.
        for i1 in bsobj.find_all("h2",text="Summary box"):
            # Calling a tag is shorthand for find_all(): every tag below the
            # heading's parent is examined.
            for i2 in i1.parent():
                if i2.find_all("table") != []:
                    tabs = i2.find_all("table")
                    if re.match(".*isa-saver-fixed.*",url):
                        # Fixed ISA: exactly two tables expected, the first is discarded.
                        if len(tabs) != 2:
                            themortgagemeter_utils.record_alert('ERROR: too many tabs in isa',logger,themortgagemeter_db.db_connection,themortgagemeter_db.cursor)
                            exit()
                        else:
                            tabs.pop(0)
                        for tab in tabs:
                            tbody = tab.find_all("tbody")[0]
                            trs = tbody.find_all("tr")
                            for tr in trs:
                                # One record per row: first cell is the term, second the rate.
                                savings_data_tmp = savings_data.copy()
                                tds = tr.find_all("td")
                                savings_data_tmp['savings_period'] = themortgagemeter_utils.get_months(tds[0].text.strip().encode('utf-8'),logger)
                                savings_data_tmp['aer_percent'] = themortgagemeter_utils.get_percentage(tds[1].text.strip().encode('utf-8'),logger)
                                savings_data_tmp['gross_percent'] = savings_data_tmp['aer_percent']
                                savings_array.append(savings_data_tmp)
                    else:
                        if len(tabs) > 1:
                            #print tabs
                            themortgagemeter_utils.record_alert('ERROR: too many tabs in isa',logger,themortgagemeter_db.db_connection,themortgagemeter_db.cursor)
                            exit()
                        for tab in tabs:
                            #print tab
                            for tr in tab.find_all("tr"):
                                ths = tr.find_all("th")
                                tds = tr.find_all("td")
                                if len(ths) > 0 and len(tds) > 0:
                                    # Label in a <th>, value in the first <td>.
                                    th = tr.find_all("th")[0]
                                    td = tr.find_all("td")[0]
                                    th_text = th.text.lower()
                                    td_text = td.text.lower()
                                    if re.match('interest rates.*',th_text):
                                        #print "IR:" + td_text
                                        pc = themortgagemeter_utils.get_percentage(td_text,logger)
                                        savings_data_tmp = savings_data.copy()
                                        savings_data_tmp['gross_percent'] = pc
                                        savings_data_tmp['aer_percent'] = pc
                                        savings_array.append(savings_data_tmp)
                                else:
                                    if len(ths) == 0 and len(tds) > 0:
                                        # Label and value in the first two <td>s.
                                        td1 = tds[0]
                                        td2 = tds[1]
                                        td1_text = td1.text.lower()
                                        td2_text = td2.text.lower()
                                        if re.match('interest rates.*',td1_text):
                                            pc = themortgagemeter_utils.get_percentage(td2_text,logger)
                                            savings_data_tmp = savings_data.copy()
                                            savings_data_tmp['gross_percent'] = pc
                                            savings_data_tmp['aer_percent'] = pc
                                            savings_array.append(savings_data_tmp)
                                    # NOTE(review): this else is taken to pair with the
                                    # "no th, some td" test above -- confirm.
                                    else:
                                        themortgagemeter_utils.record_alert('ERROR: unhandled case: ' + url,logger,themortgagemeter_db.db_connection,themortgagemeter_db.cursor)
                                        exit()
    elif re.match('.*fixed-online-saver.*',url) or re.match('.*tracker-bond.*',url) or re.match('.*fixed-saver.*',url):
        # Bond-style pages: locate the heading above the rates table(s).
        if re.match('.*fixed-online-saver.*',url) or re.match('.*fixed-saver.*',url):
            #print bsobj
            code = "FOS"
            i1s = bsobj.find_all("h3",text="Current Rates")
            if i1s== []:
                # Same heading with a lower-case "rates".
                i1s = bsobj.find_all("h3",text="Current rates")
        elif re.match('.*tracker-bond.*',url):
            #print bsobj
            code = "TB"
            i1s = []
            res = bsobj.find_all("h4")
            for i in res:
                #print i.text
                if i.text == "Current rates and apply":
                    i1s.append(i)
                    break
        if i1s == []:
            # Alert only; the loop below then does nothing and the empty
            # result is caught at the end of the function.
            themortgagemeter_utils.record_alert('No items from expected h3/4 match!',logger,themortgagemeter_db.db_connection,themortgagemeter_db.cursor)
        for i1 in i1s:
            for i2 in i1.parent():
                tbodys = i2.find_all("tbody")
                # if this is tracker bond, discard the first table
                if len(tbodys) == 0:
                    continue
                if code == "TB":
                    # Only accept a tracker-bond element if some row starts with a "Term" cell.
                    ok = False
                    for tbody in tbodys:
                        for tr in tbody.find_all("tr"):
                            tds = tr.find_all("td")
                            if tds[0].text == "Term":
                                ok = True
                    if not ok:
                        continue
                for tbody in tbodys:
                    tr_count = -1
                    table_savings_period = "unset"
                    for tr in tbody.find_all("tr"):
                        tr_count = tr_count + 1
                        if code == "TB" and tr_count == 0:
                            # skip the first row
                            continue
                        # clone the savings_data ready to write to
                        savings_data_tmp = savings_data.copy()
                        # First td is time only on first row for TB
                        if code == "TB" and tr_count > 1:
                            td_count = 1
                        else:
                            td_count = 0
                        if code == "TB" and tr_count > 1:
                            # Later tracker-bond rows reuse the term read from the first data row.
                            if table_savings_period == "unset":
                                themortgagemeter_utils.record_alert('ERROR: table_savings_period should not be unset',logger,themortgagemeter_db.db_connection,themortgagemeter_db.cursor)
                                exit()
                            savings_data_tmp['savings_period'] = table_savings_period
                        for td in tr.find_all("td"):
                            # 0 - term
                            # 1 - balance
                            # 2 - Gross
                            # 3 - AER
                            # 4 - NET (ignore)
                            # Ignore remainder of cols
                            text = td.text.lower().strip().encode('utf-8')
                            if td_count == 0:
                                # store this in a variable for use on next row if necessary
                                table_savings_period = themortgagemeter_utils.get_months(text,logger)
                                savings_data_tmp['savings_period'] = table_savings_period
                            elif td_count == 1:
                                res = savings_util.get_money_range(text,logger)
                                savings_data_tmp['min_amt'] = res[0]
                                savings_data_tmp['max_amt'] = res[1]
                            elif td_count == 2:
                                savings_data_tmp['gross_percent'] = themortgagemeter_utils.get_percentage(text,logger)
                            elif td_count == 3:
                                savings_data_tmp['aer_percent'] = themortgagemeter_utils.get_percentage(text,logger)
                                # and then break out
                                break
                            td_count = td_count + 1
                        savings_array.append(savings_data_tmp)
    elif re.match('.*/online-saver/',url):
        # TODO: need to set this for other types
        savings_data['interest_paid'] = 'Y'
        #print bsobj
        # get the apr class element, as that contains the text we need
        apr = bsobj.find_all(attrs={'class':'apr'})[0].parent.parent.text.encode('utf-8')
        # split this line by \n
        apr = apr.split('\n')
        lines = []
        for l in apr:
            # A line holding two offers joined by " or <digit>" is split in two.
            if re.match('.* or [0-9].*',l):
                for l2 in l.split(' or ',1):
                    lines.append(l2)
            else:
                lines.append(l)
        while '' in lines:
            lines.remove('')
        for l in lines:
            # copy
            savings_data_tmp = savings_data.copy()
            #print l
            # get percentage
            savings_data_tmp['gross_percent'] = themortgagemeter_utils.get_percentage(l,logger)
            savings_data_tmp['aer_percent'] = savings_data_tmp['gross_percent']
            # get_money range
            res = savings_util.get_money_range(l,logger)
            savings_data_tmp['min_amt'] = res[0]
            savings_data_tmp['max_amt'] = res[1]
            # append to savings_array
            savings_array.append(savings_data_tmp)
    elif re.match('.*/regular-saver/',url):
        # TODO: need to set this for other types
        savings_data['interest_paid'] = 'Y'
        savings_data['regular_saver_frequency_period'] = '1'
        savings_data['regular_saver_frequency_type'] = 'M'
        savings_data['regular_saver'] = 'Y'
        # Always fixed
        savings_data['variability'] = 'F'
        #print bsobj
        # get the apr class element, as that contains the text we need
        apr = bsobj.find_all(attrs={'class':'apr'})[0].parent.parent.text.encode('utf-8')
        # split this line by \n
        apr = apr.split('\n')
        lines = []
        for l in apr:
            if re.match('.* or [0-9].*',l):
                for l2 in l.split(' or ',1):
                    lines.append(l2)
            else:
                lines.append(l)
        while '' in lines:
            lines.remove('')
        for l in lines:
            # copy
            savings_data_tmp = savings_data.copy()
            # get percentage
            savings_data_tmp['gross_percent'] = themortgagemeter_utils.get_percentage(l,logger)
            if savings_data_tmp['gross_percent'] == '':
                # No percentage on this line: skip it.
                continue
            savings_data_tmp['aer_percent'] = savings_data_tmp['gross_percent']
            # Hard-code to 25-250 for now, this seems standard
            savings_data_tmp['regular_saver_min_amt'] = '25'
            savings_data_tmp['regular_saver_max_amt'] = '250'
            # append to savings_array
            savings_array.append(savings_data_tmp)
    elif re.match('.*/everyday-saver/',url):
        # This one's quite simple (I think)
        # TODO: need to set this for other types
        savings_data['interest_paid'] = 'Y'
        #print bsobj
        # get the apr class element, as that contains the text we need
        apr = bsobj.find_all(attrs={'class':'apr'})[0].parent.parent.text.encode('utf-8')
        #print apr
        # split this line by \n
        apr = apr.split('\n')
        lines = []
        for l in apr:
            # Only lines mentioning "gross" carry a rate here.
            if re.match('.*gross.*',l):
                lines.append(l)
        while '' in lines:
            lines.remove('')
        for l in lines:
            # copy
            savings_data_tmp = savings_data.copy()
            #print l
            # get percentage
            savings_data_tmp['gross_percent'] = themortgagemeter_utils.get_percentage(l,logger)
            savings_data_tmp['aer_percent'] = savings_data_tmp['gross_percent']
            # TODO: bonus_frequency_period set to 1, or get from data?
            # append to savings_array
            savings_array.append(savings_data_tmp)
    elif re.match('.*/branch-accounts/.*',url):
        # Branch accounts are not parsed: return the (empty) list without alerting.
        return savings_array
    else:
        logger.info('unhandled:' + url)
        exit()
    if savings_array == []:
        themortgagemeter_utils.record_alert('ERROR: returning nothing from a page',logger,themortgagemeter_db.db_connection,themortgagemeter_db.cursor)
        exit()
    # Return the savings_array
    logger.info('returning savings_array:' + str(savings_array))
    return savings_array
def _insert_halifax_savings(record, url, logger):
    """Pass one savings record dictionary to savings_util.handle_savings_insert."""
    savings_util.handle_savings_insert(
        institution_code,
        record['isa'],
        record['regular_saver'],
        record['regular_saver_frequency_period'],
        record['regular_saver_frequency_type'],
        record['regular_saver_min_amt'],
        record['regular_saver_max_amt'],
        record['bonus'],
        record['bonus_frequency_period'],
        record['bonus_frequency_type'],
        record['online'],
        record['branch'],
        record['variability'],
        record['savings_period'],
        record['min_amt'],
        record['max_amt'],
        record['gross_percent'],
        record['aer_percent'],
        record['child'],
        record['interest_paid'],
        url,
        logger)


def get_product_pages(static, base_url, ext_url, logger):
    """Walk the Halifax savings listing and record every product found.

    Each row of every ``sortableTable`` is read cell by cell:

        0 - account title ("isa" / "junior" set the isa / child flags)
        1 - headline rate text (not used at the moment)
        2 - minimum investment
        3 - Variable / Fixed
        4 - withdrawals allowed (left to the detail page)
        5 - channel icons; img titles "online" / "branch" set those flags
        6 - "find out more" link to the detail page

    The detail page is parsed by ``process_more_info_page`` and every record
    it returns is inserted.  A detail URL is only processed once.

    Args:
        static: passed through to ``themortgagemeter_utils.get_page``.
        base_url: site prefix, also prepended to detail links.
        ext_url: path of the listing page.
        logger: logger passed to every helper.
    """
    url = base_url + ext_url
    urls_seen = set()
    bsobj = themortgagemeter_utils.get_page(
        static, 'static_html/halifax/savings-accounts.html', url, logger)
    # let's see how much info we can extract from the page
    # Get all the sortable tables and divine as much info as possible from that.
    sortable_tables = bsobj.find_all(attrs={'class': 'sortableTable'})
    for table in sortable_tables:
        for tr in table.find_all('tr'):
            savings_data = savings_util.get_savings_data_object()
            # enumerate keeps the cell index in step even when a cell is
            # skipped with `continue`.
            for td_idx, td in enumerate(tr.find_all('td')):
                td_text = td.text.encode('utf-8').strip().lower()
                if td_idx == 0:
                    # title of account - Junior == child
                    if re.match('^.*isa.*$', td_text):
                        savings_data['isa'] = 'Y'
                    if re.match('^.*junior.*$', td_text):
                        savings_data['child'] = 'Y'
                elif td_idx == 1:
                    # We don't bother with this at the moment - TODO - sort this out
                    pass
                elif td_idx == 2:
                    # minimum investment, max is always infinity
                    savings_data['min_amt'] = themortgagemeter_utils.get_money(
                        td_text, logger)
                elif td_idx == 3:
                    # Variable/Fixed
                    if re.match('.*variable.*', td_text):
                        savings_data['variability'] = 'V'
                    elif re.match('.*fixed.*', td_text):
                        savings_data['variability'] = 'F'
                    else:
                        themortgagemeter_utils.record_alert(
                            'ERROR: unknown variability: ' + td_text, logger,
                            themortgagemeter_db.db_connection,
                            themortgagemeter_db.cursor)
                        exit()
                elif td_idx == 4:
                    # Let's assume we'll get this info from the sub-page.
                    # Withdrawals allowed: "None, by closure only", "Unlimited", "None, until child is 18"
                    pass
                elif td_idx == 5:
                    for img in td.find_all('img'):
                        # An img without a title attribute is ignored.
                        title = (img.get('title') or '').lower().strip()
                        if title == 'online':
                            savings_data['online'] = 'Y'
                        elif title == 'branch':
                            savings_data['branch'] = 'Y'
                        # "phone" is deliberately ignored
                elif td_idx == 6:
                    # more details link
                    new_url = base_url + td.find_all('a')[0].get('href')
                    if new_url in urls_seen:
                        continue
                    savings_array = process_more_info_page(
                        savings_data, new_url, logger)
                    logger.debug(new_url)
                    for this_savings_data in savings_array:
                        logger.debug(this_savings_data)
                        # The listing page URL is what gets stored, as before.
                        _insert_halifax_savings(this_savings_data, url, logger)
                    urls_seen.add(new_url)
                else:
                    # str() is required: tr is a tag object, not a string.
                    themortgagemeter_utils.record_alert(
                        'ERROR: too many tds in tr: ' + str(tr), logger,
                        themortgagemeter_db.db_connection,
                        themortgagemeter_db.cursor)
                    exit()
def _halifax_alert(message, logger):
    """Record an alert through themortgagemeter_utils using the shared DB handles."""
    themortgagemeter_utils.record_alert(
        message, logger, themortgagemeter_db.db_connection,
        themortgagemeter_db.cursor)


def _halifax_rate_copy(savings_data, percent):
    """Return a copy of savings_data with gross and AER both set to percent."""
    record = savings_data.copy()
    record['gross_percent'] = percent
    record['aer_percent'] = percent
    return record


def _halifax_apr_lines(bsobj):
    """Return the text around the first 'apr'-class element, split into lines."""
    apr_element = bsobj.find_all(attrs={'class': 'apr'})[0]
    return apr_element.parent.parent.text.encode('utf-8').split('\n')


def _halifax_offer_lines(raw_lines):
    """Split lines holding two offers joined by ' or <digit>'; drop empty lines."""
    lines = []
    for line in raw_lines:
        if re.match('.* or [0-9].*', line):
            lines.extend(line.split(' or ', 1))
        else:
            lines.append(line)
    return [line for line in lines if line != '']


def _halifax_isa_rows(bsobj, savings_data, url, logger):
    """Collect records from the table(s) under an ISA page's "Summary box" heading."""
    rows = []
    fixed_isa = re.match(".*isa-saver-fixed.*", url)
    for heading in bsobj.find_all("h2", text="Summary box"):
        # Calling a tag is shorthand for find_all(): every tag below the
        # heading's parent is examined.
        for node in heading.parent():
            tabs = node.find_all("table")
            if not tabs:
                continue
            if fixed_isa:
                # Exactly two tables expected; the first is discarded.
                if len(tabs) != 2:
                    _halifax_alert('ERROR: too many tabs in isa', logger)
                    exit()
                tabs.pop(0)
                for tab in tabs:
                    tbody = tab.find_all("tbody")[0]
                    for tr in tbody.find_all("tr"):
                        # First cell is the term, second the rate.
                        tds = tr.find_all("td")
                        record = savings_data.copy()
                        record['savings_period'] = themortgagemeter_utils.get_months(
                            tds[0].text.strip().encode('utf-8'), logger)
                        record['aer_percent'] = themortgagemeter_utils.get_percentage(
                            tds[1].text.strip().encode('utf-8'), logger)
                        record['gross_percent'] = record['aer_percent']
                        rows.append(record)
                continue
            if len(tabs) > 1:
                _halifax_alert('ERROR: too many tabs in isa', logger)
                exit()
            for tab in tabs:
                for tr in tab.find_all("tr"):
                    ths = tr.find_all("th")
                    tds = tr.find_all("td")
                    if len(ths) > 0 and len(tds) > 0:
                        # Label in a <th>, value in the first <td>.
                        label = ths[0].text.lower()
                        value = tds[0].text.lower()
                    elif len(ths) == 0 and len(tds) > 1:
                        # Label and value in the first two <td>s.  Requiring two
                        # cells sends a single-cell row to the alert below
                        # instead of an IndexError.
                        label = tds[0].text.lower()
                        value = tds[1].text.lower()
                    else:
                        _halifax_alert('ERROR: unhandled case: ' + url, logger)
                        exit()
                    if re.match('interest rates.*', label):
                        rows.append(_halifax_rate_copy(
                            savings_data,
                            themortgagemeter_utils.get_percentage(value, logger)))
    return rows


def _halifax_has_term_cell(tbodys):
    """Tell whether any row in tbodys starts with a cell whose text is "Term"."""
    for tbody in tbodys:
        for tr in tbody.find_all("tr"):
            tds = tr.find_all("td")
            # Rows without any <td> (e.g. header rows) cannot match.
            if tds and tds[0].text == "Term":
                return True
    return False


def _halifax_bond_rows(bsobj, savings_data, url, logger):
    """Collect records from fixed-saver and tracker-bond rate tables."""
    rows = []
    if re.match('.*fixed-online-saver.*', url) or re.match('.*fixed-saver.*', url):
        is_tracker = False
        headings = bsobj.find_all("h3", text="Current Rates")
        if not headings:
            # Same heading with a lower-case "rates".
            headings = bsobj.find_all("h3", text="Current rates")
    else:
        # The caller only sends tracker-bond URLs here otherwise.
        is_tracker = True
        headings = []
        for h4 in bsobj.find_all("h4"):
            if h4.text == "Current rates and apply":
                headings.append(h4)
                break
    if not headings:
        # Alert only: the empty result is dealt with by the caller.
        _halifax_alert('No items from expected h3/4 match!', logger)
    for heading in headings:
        for node in heading.parent():
            tbodys = node.find_all("tbody")
            if len(tbodys) == 0:
                continue
            # For a tracker bond, only accept an element with a "Term" row.
            if is_tracker and not _halifax_has_term_cell(tbodys):
                continue
            for tbody in tbodys:
                table_savings_period = "unset"
                for tr_count, tr in enumerate(tbody.find_all("tr")):
                    if is_tracker and tr_count == 0:
                        # skip the first row
                        continue
                    record = savings_data.copy()
                    # First td is the term only on the first data row for a
                    # tracker bond; later rows start at the balance column
                    # and reuse the term already read.
                    if is_tracker and tr_count > 1:
                        if table_savings_period == "unset":
                            _halifax_alert(
                                'ERROR: table_savings_period should not be unset',
                                logger)
                            exit()
                        record['savings_period'] = table_savings_period
                        td_count = 1
                    else:
                        td_count = 0
                    for td in tr.find_all("td"):
                        # 0 - term
                        # 1 - balance
                        # 2 - Gross
                        # 3 - AER
                        # 4 - NET (ignore)
                        # Ignore remainder of cols
                        text = td.text.lower().strip().encode('utf-8')
                        if td_count == 0:
                            table_savings_period = themortgagemeter_utils.get_months(
                                text, logger)
                            record['savings_period'] = table_savings_period
                        elif td_count == 1:
                            res = savings_util.get_money_range(text, logger)
                            record['min_amt'] = res[0]
                            record['max_amt'] = res[1]
                        elif td_count == 2:
                            record['gross_percent'] = themortgagemeter_utils.get_percentage(
                                text, logger)
                        elif td_count == 3:
                            record['aer_percent'] = themortgagemeter_utils.get_percentage(
                                text, logger)
                            break
                        td_count = td_count + 1
                    rows.append(record)
    return rows


def process_more_info_page(savings_data, url, logger):
    """Scrape one Halifax savings detail page into a list of savings records.

    The page at ``url`` is fetched and, depending on ``savings_data['isa']``
    and on which product the URL names, one of several layouts is parsed.
    Every rate found becomes a copy of ``savings_data`` with the rate (and,
    where available, period and balance range) filled in.

    Args:
        savings_data: dictionary of values already known for the product.
            The online/regular/everyday saver branches write to this
            dictionary itself before copying it.
        url: address of the detail page.
        logger: logger passed to every helper.

    Returns:
        The list of record dictionaries.  It is returned empty only for
        branch-account URLs; in every other case an empty result records an
        alert and exits, as does an unrecognised URL or page layout.
    """
    bsobj = themortgagemeter_utils.get_page(
        False, 'static_html/halifax/savings-accounts.html', url, logger)
    savings_array = []
    logger.debug(url)
    if savings_data['isa'] == 'Y':
        savings_array = _halifax_isa_rows(bsobj, savings_data, url, logger)
    elif re.match('.*fixed-online-saver.*', url) or re.match(
            '.*tracker-bond.*', url) or re.match('.*fixed-saver.*', url):
        savings_array = _halifax_bond_rows(bsobj, savings_data, url, logger)
    elif re.match('.*/online-saver/', url):
        # TODO: need to set this for other types
        savings_data['interest_paid'] = 'Y'
        for line in _halifax_offer_lines(_halifax_apr_lines(bsobj)):
            record = _halifax_rate_copy(
                savings_data,
                themortgagemeter_utils.get_percentage(line, logger))
            res = savings_util.get_money_range(line, logger)
            record['min_amt'] = res[0]
            record['max_amt'] = res[1]
            savings_array.append(record)
    elif re.match('.*/regular-saver/', url):
        # TODO: need to set this for other types
        savings_data['interest_paid'] = 'Y'
        savings_data['regular_saver_frequency_period'] = '1'
        savings_data['regular_saver_frequency_type'] = 'M'
        savings_data['regular_saver'] = 'Y'
        # Always fixed
        savings_data['variability'] = 'F'
        for line in _halifax_offer_lines(_halifax_apr_lines(bsobj)):
            percent = themortgagemeter_utils.get_percentage(line, logger)
            if percent == '':
                # No percentage on this line: skip it.
                continue
            record = _halifax_rate_copy(savings_data, percent)
            # Hard-code to 25-250 for now, this seems standard
            record['regular_saver_min_amt'] = '25'
            record['regular_saver_max_amt'] = '250'
            savings_array.append(record)
    elif re.match('.*/everyday-saver/', url):
        # TODO: need to set this for other types
        savings_data['interest_paid'] = 'Y'
        for line in _halifax_apr_lines(bsobj):
            # Only lines mentioning "gross" carry a rate here.
            if re.match('.*gross.*', line):
                # TODO: bonus_frequency_period set to 1, or get from data?
                savings_array.append(_halifax_rate_copy(
                    savings_data,
                    themortgagemeter_utils.get_percentage(line, logger)))
    elif re.match('.*/branch-accounts/.*', url):
        # Branch accounts are not parsed: return the (empty) list without alerting.
        return savings_array
    else:
        logger.info('unhandled:' + url)
        exit()
    if savings_array == []:
        _halifax_alert('ERROR: returning nothing from a page', logger)
        exit()
    # Return the savings_array
    logger.info('returning savings_array:' + str(savings_array))
    return savings_array
def get_product_page(static,url,eligibilities):
    """Scrape the Post Office mortgage range page and insert each product.

    Every ``h2`` heading whose text yields a percentage starts a group: the
    loan-to-value is taken as ``100 - percentage``, and the rows of the next
    sibling element with class ``displaytable`` are parsed. Each parsed row is
    inserted once per entry in ``eligibilities``.

    Expected cell layout of a table row:
        td0 = years of fixed or tracker
        td1 = initial rate
        td2 = svr
        td3 = apr
        td4 = fees

    Args:
        static: passed through to ``themortgagemeter_utils.get_page`` together
            with the path of a saved copy of the page.
        url: address of the page; also stored with every inserted mortgage.
        eligibilities: iterable of eligibility codes to insert each row under.

    Returns:
        None. Results are handed to ``mc_util.handle_mortgage_insert``.

    Raises:
        IndexError: may be raised when a row has fewer cells, or its first
            cell fewer words, than the layout above assumes.
    """
    logger = logging.getLogger('retrieve')
    bsobj = themortgagemeter_utils.get_page(static,'static_html/post_office/our-full-range.html',url,logger)
    term = str(25 * 12)
    for ltv_elem in bsobj.find_all('h2'):
        ltv_elem_str = ltv_elem.string
        if not ltv_elem_str:
            continue
        ltv_percent = themortgagemeter_utils.get_percentage(ltv_elem_str,logger)
        if ltv_percent == '':
            continue
        # For post office, first reported % is 100 - LTV
        ltv_percent = str(100 - int(ltv_percent))
        div = ltv_elem.fetchNextSiblings(attrs={'class' : 'displaytable'},limit=1)
        if not div:
            continue
        logger.debug('here')
        logger.debug(div)
        for tr in div[0].find_all('tr'):
            logger.debug(tr)
            tds = tr.find_all('td')
            logger.debug(tr)
            # Only rows with more than one cell can describe a mortgage.
            if len(tds) < 2:
                continue
            rate_percent = ''
            svr_percent = ''
            apr_percent = ''
            first_cell = tds[0].text.encode('utf-8')
            logger.debug(first_cell.split('\n'))
            # Sometimes we get empty fields - we remove them here.
            s = [field for field in first_cell.split('\n') if field != '']
            initial_period = str(themortgagemeter_utils.get_months(s[0],logger))
            # TODO: generic text cleansing function
            type_str = re.sub('\xa0','',re.sub('\xc2',' ',first_cell)).split()[2]
            logger.debug('type_str: ' + type_str)
            if type_str == 'fixed':
                mortgage_type = 'F'
            elif type_str == 'tracker':
                mortgage_type = 'T'
            else:
                themortgagemeter_utils.record_alert('ERROR: PSTFFC neither fixed nor tracker: ' + type_str,logger,themortgagemeter_db.db_connection,themortgagemeter_db.cursor)
                # No usable mortgage type: skip the row rather than insert it
                # with an unset type or one left over from an earlier row.
                continue
            i = 1
            # Initial rate: first line of td1 that yields a percentage.
            for t in tds[i].text.encode('utf-8').split('\n'):
                rate_percent = themortgagemeter_utils.get_percentage(t,logger)
                if rate_percent != '':
                    break
            # SVR and APR: advance cell by cell until one yields a percentage.
            while svr_percent == '':
                i += 1
                for t in tds[i].text.encode('utf-8').split('\n'):
                    svr_percent = themortgagemeter_utils.get_percentage(t,logger)
                    if svr_percent != '':
                        break
            while apr_percent == '':
                i += 1
                for t in tds[i].text.encode('utf-8').split('\n'):
                    apr_percent = themortgagemeter_utils.get_percentage(t,logger)
                    if apr_percent != '':
                        break
            i += 1
            # Drop the first two bytes of the fee cell and any thousands separators.
            booking_fee = tds[i].text.strip().encode('utf-8')[2:].replace(',','')
            for eligibility in eligibilities:
                mc_util.handle_mortgage_insert(institution_code,mortgage_type,rate_percent,svr_percent,apr_percent,ltv_percent,initial_period,booking_fee,term,url,eligibility,logger)
def get_product_page_details(url, savings_data):
    """Fetch the page at ``url`` through ``themortgagemeter_utils.get_page``.

    The fetched page is bound to a local name but not inspected, and
    ``savings_data`` is not read or modified here. Returns None.
    """
    retrieve_logger = logging.getLogger('retrieve')
    page = themortgagemeter_utils.get_page(False, '', url, retrieve_logger)
def halifax_remortgage_page(static, url, mortgage_type, eligibility, logger):
    """Parse a Halifax remortgage table page and insert one mortgage per row.

    Each ``tr`` is flattened into a list of stripped, UTF-8 encoded strings.
    Rows with 20-24 strings and rows with exactly 25 strings are read with two
    different sets of column positions; any other row is ignored (rows with
    more than 3 strings are logged at debug level). A row is only inserted
    when its fourth string contains a ``%``.

    Example data row (before stripping), from which the positions derive:
    ['\\n', '2 years', '\\n', '4.44%', '\\n', 'Currently', ' \\xc2\\xa03.99%',
     '\\n', '30/11/2014', '\\n', '4.3% APR', '\\n', '\\xc2\\xa3995', '\\n',
     '75-80%', '\\n', '30/11/2014', '\\n', '\\xc2\\xa30-\\xc2\\xa31m', '\\n',
     'Halifax Remortgage Service*', '\\n', '\\n']

    Args:
        static: passed through to ``themortgagemeter_utils.get_page``.
        url: page address; also stored with each inserted mortgage.
        mortgage_type: mortgage type code stored with each row.
        eligibility: eligibility code stored with each row.
        logger: logger used for debug output and passed to the helpers.
    """
    page = themortgagemeter_utils.get_page(
        static, 'static_html/halifax/remortgage-fixed-75ltv.asp', url, logger)
    for row in page.find_all('tr'):
        cells = [text.encode('utf-8').strip() for text in row.strings]
        logger.debug(cells)

        # Pick the column positions (svr, apr, fee, ltv) for this row width.
        count = len(cells)
        if 19 < count < 25:
            svr_idx, apr_idx, fee_idx, ltv_idx = 6, 10, 12, 14
        elif count == 25:
            svr_idx, apr_idx, fee_idx, ltv_idx = 8, 12, 14, 16
        else:
            if count > 3:
                logger.debug('Should this be handled?: %s', cells)
            continue

        # Rows whose rate cell carries no '%' hold no product.
        if cells[3].find('%') == -1:
            continue

        rate_percent = cells[3][:-1]
        svr_text = cells[svr_idx].split()[0][:-1]
        svr_percent = svr_text.strip('\xc2').strip('\xa0')
        apr_percent = cells[apr_idx].split()[0][:-1]
        booking_fee = cells[fee_idx][2:].replace(',', '')

        period_text = cells[1]
        # handle special nonsense case: "years"/"months" not preceded by
        # "<digits><space>" is decoded from the leading characters instead.
        if (re.search(r'years', period_text)
                and not re.search(r'[0-9]+ years', period_text)):
            initial_period = str(int(period_text[0]) * 12)
        elif (re.search(r'months', period_text)
                and not re.search(r'[0-9]+ month', period_text)):
            initial_period = period_text[0:2]
        else:
            initial_period = str(
                themortgagemeter_utils.get_months(period_text, logger))

        # The LTV cell looks like "75-80%"; the part after the dash is stored.
        ltv_percent = cells[ltv_idx].split('-')[1].strip('%')
        mc_util.handle_mortgage_insert(
            institution_code, mortgage_type, rate_percent, svr_percent,
            apr_percent, ltv_percent, initial_period, booking_fee, term, url,
            eligibility, logger)
def get_product_pages(static,base_url,ext_url,logger):
    """Walk the Halifax savings listing and insert every product it links to.

    Every row of every element with class ``sortableTable`` is read cell by
    cell into a fresh savings data object:

        td0  account title ("isa" sets isa, "junior" sets child)
        td1  rate text (currently ignored)
        td2  minimum investment
        td3  Variable / Fixed
        td4  withdrawals (ignored; expected to come from the sub-page)
        td5  access icons (img titles "online" / "branch"; "phone" ignored)
        td6  "more details" link

    For td6 the linked page is handed to ``process_more_info_page`` and each
    savings record it returns is passed to
    ``savings_util.handle_savings_insert``. A link is only processed once.

    Args:
        static: passed through to ``themortgagemeter_utils.get_page``.
        base_url: prefix for both the listing URL and the detail links.
        ext_url: path of the listing page, appended to ``base_url``.
        logger: logger used for output and passed to the helpers.

    Returns:
        None. Calls ``exit()`` after recording an alert when a row has an
        unknown variability or more than seven cells.
    """
    url = base_url + ext_url
    urls_seen = set()
    bsobj = themortgagemeter_utils.get_page(static,'static_html/halifax/savings-accounts.html',url,logger)
    # Get all the sortable tables and divine as much info as possible from that.
    for table in bsobj.find_all(attrs={'class' : 'sortableTable'}):
        for tr in table.find_all('tr'):
            savings_data = savings_util.get_savings_data_object()
            for td_idx, td in enumerate(tr.find_all('td')):
                td_text = td.text.encode('utf-8').strip().lower()
                if td_idx == 0:
                    # title of account - Junior == child
                    if re.match('^.*isa.*$',td_text):
                        savings_data['isa'] = 'Y'
                    if re.match('^.*junior.*$',td_text):
                        savings_data['child'] = 'Y'
                elif td_idx == 1:
                    # We don't bother with this at the moment - TODO - sort this out
                    pass
                elif td_idx == 2:
                    # minimum investment, max is always infinity
                    savings_data['min_amt'] = themortgagemeter_utils.get_money(td_text,logger)
                elif td_idx == 3:
                    # Variable/Fixed
                    if re.match('.*variable.*',td_text):
                        savings_data['variability'] = 'V'
                    elif re.match('.*fixed.*',td_text):
                        savings_data['variability'] = 'F'
                    else:
                        themortgagemeter_utils.record_alert('ERROR: unknown variability: ' + td_text,logger,themortgagemeter_db.db_connection,themortgagemeter_db.cursor)
                        exit()
                elif td_idx == 4:
                    # Let's assume we'll get this info from the sub-page.
                    # Withdrawals allowed: "None, by closure only", "Unlimited", "None, until child is 18"
                    pass
                elif td_idx == 5:
                    for img in td.find_all('img'):
                        # An img without a title attribute is treated as no channel.
                        title = (img.get('title') or '').lower().strip()
                        if title == 'online':
                            savings_data['online'] = 'Y'
                        elif title == 'branch':
                            savings_data['branch'] = 'Y'
                        # "phone" is deliberately ignored.
                elif td_idx == 6:
                    # more details link
                    new_url = base_url + td.find_all('a')[0].get('href')
                    if new_url in urls_seen:
                        continue
                    savings_array = process_more_info_page(savings_data,new_url,logger)
                    logger.info(new_url)
                    for this_savings_data in savings_array:
                        logger.info(this_savings_data)
                        # NOTE: the listing url (not new_url) is stored with the product.
                        savings_util.handle_savings_insert(
                            institution_code,
                            this_savings_data['isa'],
                            this_savings_data['regular_saver'],
                            this_savings_data['regular_saver_frequency_period'],
                            this_savings_data['regular_saver_frequency_type'],
                            this_savings_data['regular_saver_min_amt'],
                            this_savings_data['regular_saver_max_amt'],
                            this_savings_data['bonus'],
                            this_savings_data['bonus_frequency_period'],
                            this_savings_data['bonus_frequency_type'],
                            this_savings_data['online'],
                            this_savings_data['branch'],
                            this_savings_data['variability'],
                            this_savings_data['savings_period'],
                            this_savings_data['min_amt'],
                            this_savings_data['max_amt'],
                            this_savings_data['gross_percent'],
                            this_savings_data['aer_percent'],
                            this_savings_data['child'],
                            this_savings_data['interest_paid'],
                            url,
                            logger)
                    urls_seen.add(new_url)
                else:
                    # str() is required: tr is a tag object, not a string.
                    themortgagemeter_utils.record_alert('ERROR: too many tds in tr: ' + str(tr),logger,themortgagemeter_db.db_connection,themortgagemeter_db.cursor)
                    exit()
def get_product_page(static, url, eligibilities):
    """Scrape the Post Office mortgage range page and insert each product.

    For each ``h2`` heading whose text yields a percentage, the loan-to-value
    is ``100 - percentage``. The rows of the next sibling element with class
    ``displaytable`` are parsed, and every parsed row is inserted once per
    entry in ``eligibilities``.

    Expected cell layout of a table row:
        td0 = years of fixed or tracker
        td1 = initial rate
        td2 = svr
        td3 = apr
        td4 = fees

    Args:
        static: passed through to ``themortgagemeter_utils.get_page`` together
            with the path of a saved copy of the page.
        url: address of the page; also stored with every inserted mortgage.
        eligibilities: iterable of eligibility codes to insert each row under.

    Returns:
        None. Results are handed to ``mc_util.handle_mortgage_insert``.

    Raises:
        IndexError: may be raised when a row has fewer cells, or its first
            cell fewer words, than the layout above assumes.
    """
    logger = logging.getLogger('retrieve')
    bsobj = themortgagemeter_utils.get_page(
        static, 'static_html/post_office/our-full-range.html', url, logger)
    term = str(25 * 12)

    for ltv_elem in bsobj.find_all('h2'):
        ltv_elem_str = ltv_elem.string
        if not ltv_elem_str:
            continue
        ltv_percent = themortgagemeter_utils.get_percentage(
            ltv_elem_str, logger)
        if ltv_percent == '':
            continue
        # For post office, first reported % is 100 - LTV
        ltv_percent = str(100 - int(ltv_percent))

        div = ltv_elem.fetchNextSiblings(attrs={'class': 'displaytable'},
                                         limit=1)
        if not div:
            continue
        logger.debug('here')
        logger.debug(div)

        for tr in div[0].find_all('tr'):
            logger.debug(tr)
            tds = tr.find_all('td')
            logger.debug(tr)
            # Only rows with more than one cell can describe a mortgage.
            if len(tds) < 2:
                continue

            rate_percent = ''
            svr_percent = ''
            apr_percent = ''

            first_cell = tds[0].text.encode('utf-8')
            logger.debug(first_cell.split('\n'))
            # Sometimes we get empty fields - we remove them here.
            s = [field for field in first_cell.split('\n') if field != '']
            initial_period = str(
                themortgagemeter_utils.get_months(s[0], logger))

            # TODO: generic text cleansing function
            type_str = re.sub(
                '\xa0', '', re.sub('\xc2', ' ', first_cell)).split()[2]
            logger.debug('type_str: ' + type_str)
            if type_str == 'fixed':
                mortgage_type = 'F'
            elif type_str == 'tracker':
                mortgage_type = 'T'
            else:
                themortgagemeter_utils.record_alert(
                    'ERROR: PSTFFC neither fixed nor tracker: ' + type_str,
                    logger, themortgagemeter_db.db_connection,
                    themortgagemeter_db.cursor)
                # Without a valid type the row must not be inserted; otherwise
                # it would use an unset type or one from a previous row.
                continue

            i = 1
            # Initial rate: first line of td1 that yields a percentage.
            for t in tds[i].text.encode('utf-8').split('\n'):
                rate_percent = themortgagemeter_utils.get_percentage(
                    t, logger)
                if rate_percent != '':
                    break

            # SVR and APR: advance cell by cell until one yields a percentage.
            while svr_percent == '':
                i += 1
                for t in tds[i].text.encode('utf-8').split('\n'):
                    svr_percent = themortgagemeter_utils.get_percentage(
                        t, logger)
                    if svr_percent != '':
                        break
            while apr_percent == '':
                i += 1
                for t in tds[i].text.encode('utf-8').split('\n'):
                    apr_percent = themortgagemeter_utils.get_percentage(
                        t, logger)
                    if apr_percent != '':
                        break

            i += 1
            # Drop the first two bytes of the fee cell and any thousands
            # separators.
            booking_fee = tds[i].text.strip().encode(
                'utf-8')[2:].replace(',', '')

            for eligibility in eligibilities:
                mc_util.handle_mortgage_insert(
                    institution_code, mortgage_type, rate_percent,
                    svr_percent, apr_percent, ltv_percent, initial_period,
                    booking_fee, term, url, eligibility, logger)
def get_product_page_interest_rates(url, savings_data):
    """Parse the interest-rate tables of a savings page and insert each row.

    Every ``table`` on the page is classified by the text that follows
    "interest rates: " in its ``summary`` attribute. The classification sets
    flags on ``savings_data``; then every row after the first is copied into
    its own record, filled from the row's cells and passed to
    ``savings_util.handle_savings_insert``.

    ``savings_data`` is modified in place, so flags set for one table are
    still set when later tables (and the caller) see the dict.

    Args:
        url: page to fetch; also stored with every inserted product.
        savings_data: dict of savings attributes used as the template for
            each product row.

    Returns:
        None. Calls ``exit()`` when a table has no summary, when the summary
        is not recognised (after recording an alert), or when a regular-saver
        cell cannot be parsed (after recording an alert).
    """
    logger = logging.getLogger('retrieve ' + url)
    bsobj = themortgagemeter_utils.get_page(False, '', url, logger)
    if re.match('.*isa.*', url):
        savings_data['isa'] = 'Y'
    for t in bsobj.find_all('table'):
        # Get all tables, then match on summary == "Interest rates:.*", and
        # set up variables accordingly.
        raw_summary = t.get('summary')
        # A missing attribute is treated like an empty one.
        summary = raw_summary.encode('utf-8').lower() if raw_summary else ''
        if not summary:
            exit()
        summary_match = re.match('.*interest rates: (.*)', summary)
        if summary_match is None:
            # The summary does not follow the expected pattern at all.
            themortgagemeter_utils.record_alert(
                'NEED TO HANDLE: ' + summary, logger,
                themortgagemeter_db.db_connection,
                themortgagemeter_db.cursor)
            exit()
        summary_info = summary_match.group(1)

        # Set up data for this page. The first three cases are exact
        # matches; the remaining ones are substring matches.
        if summary_info == "cash e-isa#":
            savings_data['isa'] = 'Y'
        elif summary_info == "fixed rate saver - monthly interest":
            savings_data['variability'] = 'F'
            savings_data['interest_paid'] = 'M'
        elif summary_info == "fixed rate saver - annual interest":
            savings_data['variability'] = 'F'
            savings_data['interest_paid'] = 'Y'
        elif "regular saver" in summary_info:
            savings_data['regular_saver'] = 'Y'
            savings_data['interest_paid'] = 'Y'
        elif "online bonus" in summary_info:
            savings_data['bonus'] = 'Y'
            savings_data['branch'] = 'N'
            savings_data['bonus_frequency_period'] = '1'
            savings_data['bonus_frequency_type'] = 'M'
            # skip bonus for HSBC - it's complicated - probably needs its own function TODO
            continue
        elif "flexible saver" in summary_info:
            savings_data['variability'] = 'V'
        else:
            themortgagemeter_utils.record_alert(
                'NEED TO HANDLE: ' + summary_info, logger,
                themortgagemeter_db.db_connection,
                themortgagemeter_db.cursor)
            exit()

        for tr_count, tr in enumerate(t.find_all('tr')):
            # This is a new savings product, so clone the data at this point
            # and use that from here.
            this_savings_data = savings_data.copy()
            # Regular-saver rows start one cell early, so their first cell
            # falls outside the positions handled below.
            if this_savings_data['regular_saver'] == 'Y':
                td_count = -1
            else:
                td_count = 0
            # The first row of each table is never parsed.
            if tr_count < 1:
                continue

            for td in tr.find_all('td'):
                td_style = td.get('style')
                if td_style is not None:
                    td_style = td_style.lower().encode('utf-8').translate(
                        None, ' ')
                    # NOTE(review): an earlier comment near here read "If
                    # tax-free, this will be true" - presumably it refers to
                    # these vertically centred cells; confirm.
                    if td_style == 'vertical-align:middle':
                        continue
                logger.info(td)
                v = td.text.encode('utf-8').lower().strip()
                if td_count == 0:
                    if this_savings_data['regular_saver'] == 'Y':
                        logger.info('regular_saver: ' + v)
                        words = v.split()
                        # Words 0 and 2 are the min and max amounts; their
                        # first two bytes are dropped.
                        this_savings_data['regular_saver_min_amt'] = words[0][2:]
                        this_savings_data['regular_saver_max_amt'] = words[2][2:]
                        if words[4] == "month":
                            this_savings_data[
                                'regular_saver_frequency_period'] = '1'
                            this_savings_data[
                                'regular_saver_frequency_type'] = 'M'
                        else:
                            themortgagemeter_utils.record_alert(
                                'ERROR: reg saver not parsed: ' + v, logger,
                                themortgagemeter_db.db_connection,
                                themortgagemeter_db.cursor)
                            exit()
                    else:
                        # if it's got a + at the end, it's a min, if it's
                        # "up to" it's a max.
                        res = savings_util.get_money_range(v, logger)
                        this_savings_data['min_amt'] = res[0]
                        this_savings_data['max_amt'] = res[1]
                elif td_count == 1:
                    # we don't bother with net_percent
                    pass
                elif td_count == 2:
                    # gross %
                    this_savings_data['gross_percent'] = v
                elif td_count == 3:
                    this_savings_data['aer_percent'] = v
                td_count += 1

            # Some trs have no tds; we ignore those.
            if td_count <= 0:
                continue

            # Now store this product
            # TODO: fixed savings?
            logger.info(this_savings_data)
            savings_util.handle_savings_insert(
                institution_code,
                this_savings_data['isa'],
                this_savings_data['regular_saver'],
                this_savings_data['regular_saver_frequency_period'],
                this_savings_data['regular_saver_frequency_type'],
                this_savings_data['regular_saver_min_amt'],
                this_savings_data['regular_saver_max_amt'],
                this_savings_data['bonus'],
                this_savings_data['bonus_frequency_period'],
                this_savings_data['bonus_frequency_type'],
                this_savings_data['online'],
                this_savings_data['branch'],
                this_savings_data['variability'],
                this_savings_data['savings_period'],
                this_savings_data['min_amt'],
                this_savings_data['max_amt'],
                this_savings_data['gross_percent'],
                this_savings_data['aer_percent'],
                this_savings_data['child'],
                this_savings_data['interest_paid'],
                url,
                logger)