def parse(self, response):
    """Scrape the ING mortgage rate tables from the PhantomJS-rendered
    page and append one CSV row per rate cell to output-ing.csv.

    Side effects: navigates self.driver to response.url, appends to
    output-ing.csv, and closes the driver. Returns nothing.
    """
    self.driver.get(response.url)
    now = datetime.datetime.now()
    sel = scrapy.Selector(text=self.driver.page_source)
    tables = sel.xpath(
        '//table[@class="table table-b table-lr-unpadded l-mb-0"]/tbody')
    title = sel.xpath(
        "//table[@class='table table-b table-lr-unpadded l-mb-0']/thead/tr/th/p/text()"
    ).extract()
    title = str(title).split()[0]
    # "Valid since" date: pull the disclaimer paragraph and extract the
    # first "<day> <month> <year>" token from it.
    validString = sel.xpath(
        "//p[contains(@class, 'small-font') and contains(text(),'Deze tarieven gelden voor nieuwe offertes en renteaanpassingen voor bestaande hypotheken uitgebracht vanaf')]/text()"
    ).extract()
    validString = doFormattingUnicode(str(validString[0]))
    log.msg("Date ------------- %s" % validString, level=log.DEBUG)
    regex = r"([0-9]+\s+[a-z]+\s+[0-9])\w+"
    matches = re.search(regex, validString, re.DOTALL)
    if not matches:
        # Fix: the original fell through with `date` unbound and later
        # crashed with a NameError; log and bail out instead.
        log.msg("No validity date found on %s" % response.url, level=log.ERROR)
        self.driver.close()
        return
    log.msg("Date ------------- %s" % matches.group(0), level=log.DEBUG)
    date = changeMonth(matches.group(0))   # Dutch month name -> parseable form
    date = str(parse(date)).split()[0]     # keep the YYYY-MM-DD part only
    check_date = str(now.strftime("%Y-%m-%d"))
    try:
        # Fix: the file handle was never closed; `with` guarantees it.
        with open("output-ing.csv", "a+") as f:
            for i in range(0, 2):  # first two tables only, as before
                rows = tables[i].xpath("tr")
                headers = tables[i].xpath("tr/td/strong/text()").extract()
                for j in range(1, len(rows)):  # skip the header row
                    datas = rows[j].xpath("td/text()").extract()
                    for k in range(1, len(datas)):  # cell 0 is the period
                        coverage = headers[k - 1].encode("utf-8").strip()
                        # One CSV row per (period, coverage) rate cell.
                        f.write(";".join([
                            "NL",
                            "ING",
                            "ING" + doFormattingUnicode(title),
                            "Annuiteitenhypotheek",
                            datas[0],
                            doFormattingUnicode(datas[k]),
                            coverage,
                            check_date,
                            date,
                            "N",
                        ]) + ";\n")
    finally:
        # NOTE(review): driver.close() only closes the window; quit()
        # would also end the PhantomJS process — confirm intent.
        self.driver.close()
def scrap(divRow, validSince):
    """Append one CSV row per rate cell of a Rabobank rate table to
    output-rabo.csv.

    divRow -- selector for the <div> wrapping the table and its <h2> title
    validSince -- "valid since" date string, written verbatim into each row
    """
    now = datetime.datetime.now()
    check_date = str(now.strftime("%Y-%m-%d"))
    tableRows = divRow.xpath("div/table/tbody/tr")
    headers = tableRows[0].xpath("td")
    # Product name comes from the section heading; it is loop-invariant,
    # so compute it once instead of once per cell.
    productName = doFormattingUnicode(
        doFormattingProductName(str(divRow.xpath("h2/text()").extract())))
    # Fix: `with` guarantees the handle is closed even on error.
    with open("output-rabo.csv", "a+") as f:
        for i in range(1, len(tableRows)):  # row 0 holds the column headers
            tableDatas = tableRows[i].xpath("td")
            for j in range(1, len(tableDatas)):  # cell 0 is the period label
                # Coverage header (e.g. a percentage band) -> start/end pair.
                # The old `if (j != 0)` guard was dead code: j starts at 1.
                coverage = doFormattingUnicode(
                    str(headers[j].xpath("strong/text()").extract()))
                coverage = findCoverage(coverage).split()
                coverageStart = coverage[0]
                coverageEnd = coverage[1]
                # Fix: provider name was misspelled "Robobank" in the output;
                # rabo_scraper writes "Rabobank" for the same field.
                f.write(";".join([
                    "NL",
                    "Rabobank",
                    "Rabobank" + productName,
                    "Annuiteitenhypotheek",
                    doFormattingUnicode(
                        str(tableDatas[0].xpath("text()").extract())),
                    doFormattingUnicode(
                        str(tableDatas[j].xpath("text()").extract())),
                    coverageStart,
                    coverageEnd,
                    check_date,
                    validSince,
                    "N",
                ]) + ";\n")
def rabo_scraper(divRow, validSince):
    """Collect Rabobank rate rows as a list of dicts, one per rate cell.

    divRow -- selector wrapping the rate table and its <h2> heading
    validSince -- "valid since" date string copied into every item
    Returns the list of item dicts.
    """
    collected = []
    check_date = str(datetime.datetime.now().strftime("%Y-%m-%d"))
    rows = divRow.xpath("div/table/tbody/tr")
    header_cells = rows[0].xpath("td")
    for row in rows[1:]:               # row 0 holds the column headers
        cells = row.xpath("td")
        for col in range(1, len(cells)):   # cell 0 is the period label
            product = doFormattingUnicode(
                doFormattingProductName(
                    str(divRow.xpath("h2/text()").extract())))
            raw_coverage = doFormattingUnicode(
                str(header_cells[col].xpath("strong/text()").extract()))
            bounds = findCoverage(raw_coverage).split()
            collected.append({
                'CountryCode': "NL",
                'ProviderName': "Rabobank",
                'ProductName': "Rabobank" + product,
                'LoanType': "Annuiteitenhypotheek",
                'Period': doFormattingUnicode(
                    str(cells[0].xpath("text()").extract())),
                'Rate': doFormattingUnicode(
                    str(cells[col].xpath("text()").extract())),
                'CoverageStart': bounds[0],
                'CoverageEnd': bounds[1],
                'CheckDate': check_date,
                'ValidSince': validSince,
                'NHG': "N",
            })
    return collected
def parse(self, response):
    """Scrapy callback for the Rabobank rates page: scrape the Basis-
    and Plusvoorwaarden tables via scrap(), then yield an (empty) item.
    """
    log.msg('parse(%s)' % response.url, level=log.DEBUG)
    # Start from a clean output file.
    try:
        os.remove("output.csv")
    except OSError:
        pass  # file did not exist; nothing to clean up
    divRows = response.xpath("//*[@class='s14-lamella--shadow']")
    items = RabobankItem()
    # Section headings (as single-element lists, matching .extract()).
    target1 = [u'Alle rentepercentages hypotheek met Basisvoorwaarden']
    target2 = [u'Alle rentepercentages hypotheek met Plusvoorwaarden']
    # "Valid since" date is embedded in a bullet list item.
    validString = response.xpath(
        "//li[contains(text(),'totdat wij de tarieven wijzigen')]/text()"
    ).extract()
    validString = doFormattingUnicode(str(validString))
    matches = re.search(r"([0-9]+\s+[a-z]+\s+[0-9])\w+", validString, re.DOTALL)
    if matches:
        date = str(parse(matches.group(0))).split()[0]  # YYYY-MM-DD only
        # NOTE: output.csv is (re)created here as in the original, although
        # scrap() itself writes to output-rabo.csv.
        f = open("output.csv", "a+")
        try:
            for divRow in divRows:
                head = divRow.xpath("div/h2/text()").extract()
                if head == target1 or head == target2:
                    scrap(divRow.xpath("div"), date)
        finally:
            f.close()
    else:
        # Fix: the original left `date` unbound here and crashed with a
        # NameError inside the loop; log and skip scraping instead.
        log.msg("No validity date found on %s" % response.url, level=log.ERROR)
    yield items
def parse(self, response):
    """Scrapy callback that renders three rate pages with PhantomJS
    (ING annuity/linear, ING other mortgage types, Rabobank) and hands
    each rendered page to the matching scraper.

    Returns an empty item list; the scrapers persist their own output.
    """
    items = []
    # Same "<day> <month> <year>" date pattern is used on all three pages.
    date_regex = r"([0-9]+\s+[a-z]+\s+[0-9])\w+"

    # --- ING I: annuity and linear mortgages ----------------------------
    self.driver = webdriver.PhantomJS()
    self.driver.set_window_size(1120, 550)
    try:
        self.driver.get(response.url)
        log.msg("URL ----- %s" % response.url, level=log.DEBUG)
        sel = scrapy.Selector(text=self.driver.page_source)
        validString = sel.xpath(
            "//p[contains(@class, 'small-font') and contains(text(),'Deze tarieven gelden voor nieuwe offertes en renteaanpassingen voor bestaande hypotheken uitgebracht vanaf')]/text()"
        ).extract()
        validString = doFormattingUnicode(
            str(validString[0]).encode("utf-8").strip())
        matches = re.search(date_regex, validString, re.DOTALL)
        if matches:
            # Dutch month name -> parseable form; keep YYYY-MM-DD only.
            date = str(parse(changeMonth(matches.group(0)))).split()[0]
            ing_scraper("Annuïteitenhypotheek", sel, date)
            ing_scraper("Lineaire hypotheek", sel, date)
    finally:
        # Fix: close the driver even if scraping raises.
        self.driver.close()

    # --- ING II: other mortgage types -----------------------------------
    self.driver = webdriver.PhantomJS()
    self.driver.set_window_size(1120, 550)
    url2 = 'https://www.ing.nl/particulier/hypotheken/actuele-hypotheekrente/actuele-hypotheekrente-andere-hypotheken/index.html'
    try:
        self.driver.get(url2)
        sel = scrapy.Selector(text=self.driver.page_source)
        log.msg("URL ----- %s" % url2, level=log.DEBUG)
        validString = sel.xpath(
            "//small[contains(text(),'Deze tarieven gelden voor nieuwe offertes en renteaanpassingen voor bestaande hypotheken uitgebracht vanaf')]/text()"
        ).extract()
        log.msg("String ----> %s" % validString, level=log.DEBUG)
        validString = doFormattingUnicode(
            (validString[0]).encode("utf-8").strip())
        matches = re.search(date_regex, validString, re.DOTALL)
        if matches:
            date = str(parse(changeMonth(matches.group(0)))).split()[0]
            ing_scraper("Aflossingsvrije hypotheek", sel, date)
            ing_scraper("Bankspaarhypotheek", sel, date)
    finally:
        # Fix: this driver was never closed in the original (process leak).
        self.driver.close()

    # --- Rabobank --------------------------------------------------------
    self.driver = webdriver.PhantomJS()
    self.driver.set_window_size(1120, 550)
    url3 = 'https://www.rabobank.nl/particulieren/hypotheek/hypotheekrente/?intcamp=pa-hypotheek&inttype=tegel-hypotheekrente&intsource=hypotheek'
    try:
        self.driver.get(url3)
        sel = scrapy.Selector(text=self.driver.page_source)
        log.msg("URL ----- %s" % url3, level=log.DEBUG)
        divRows = sel.xpath("//*[@class='s14-lamella--shadow']")
        # Section headings (single-element lists, matching .extract()).
        target1 = [u'Alle rentepercentages hypotheek met Basisvoorwaarden']
        target2 = [u'Alle rentepercentages hypotheek met Plusvoorwaarden']
        validString = sel.xpath(
            "//li[contains(text(),'totdat wij de tarieven wijzigen')]/text()"
        ).extract()
        validString = changeMonth(doFormattingUnicode(str(validString)))
        matches = re.search(date_regex, validString, re.DOTALL)
        if matches:
            date = str(parse(matches.group(0))).split()[0]
            # output.csv is (re)opened here as in the original, although
            # rabo_scraper itself only returns items and writes nothing.
            f = open("output.csv", "a+")
            try:
                for divRow in divRows:
                    head = divRow.xpath("div/h2/text()").extract()
                    if head == target1 or head == target2:
                        # NOTE(review): the returned item list is discarded,
                        # as in the original — confirm whether it should be
                        # collected into `items` instead.
                        rabo_scraper(divRow.xpath("div"), date)
            finally:
                f.close()
        else:
            # Fix: the original left `date` unbound here, causing a
            # NameError at the rabo_scraper call when a table matched.
            log.msg("No validity date found on %s" % url3, level=log.ERROR)
    finally:
        # Fix: this driver was never closed in the original (process leak).
        self.driver.close()

    return items