Code example #1
    def parse(self, response):
        f = open("output-ing.csv", "a+")
        self.driver.get(response.url)
        now = datetime.datetime.now()
        sel = scrapy.Selector(text=self.driver.page_source)
        tables = sel.xpath(
            '//table[@class="table table-b table-lr-unpadded l-mb-0"]/tbody')

        title = sel.xpath(
            "//table[@class='table table-b table-lr-unpadded l-mb-0']/thead/tr/th/p/text()"
        ).extract()
        title = str(title).split()[0]

        #search for the "valid since" date
        validString = sel.xpath(
            "//p[contains(@class, 'small-font') and contains(text(),'Deze tarieven gelden voor nieuwe offertes en renteaanpassingen voor bestaande hypotheken uitgebracht vanaf')]/text()"
        ).extract()
        validString = doFormattingUnicode(str(validString[0]))
        log.msg("Date ------------- %s" % validString, level=log.DEBUG)

        regex = r"([0-9]+\s+[a-z]+\s+[0-9])\w+"
        pattern = re.compile(regex)
        date = ""  #fallback so the CSV column below is never undefined
        matches = pattern.search(validString)
        if matches:
            log.msg("Date ------------- %s" % matches.group(0),
                    level=log.DEBUG)
            date = changeMonth(matches.group(0))
            date = parse(date)
            date = str(date).split()[0]

        for i in range(0, 2):  #only the first two rate tables
            rows = tables[i].xpath("tr")
            headers = tables[i].xpath("tr/td/strong/text()").extract()
            for j in range(1, len(rows)):
                datas = rows[j].xpath("td/text()").extract()
                for k in range(1, len(datas)):
                    f.write("NL;ING;")
                    f.write("ING")
                    f.write(doFormattingUnicode(title))
                    f.write(";")
                    f.write("Annuiteitenhypotheek")
                    f.write(";")
                    f.write(datas[0])
                    f.write(";")
                    data = doFormattingUnicode(datas[k])
                    f.write(data)
                    f.write(";")
                    temp = headers[k - 1].encode("utf-8").strip()
                    f.write(temp)
                    f.write(";")
                    f.write(str(now.strftime("%Y-%m-%d")))
                    f.write(";")
                    f.write(date)
                    f.write(";")
                    f.write("N;\n")

        f.close()
        self.driver.quit()  #quit() also terminates the PhantomJS process
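The long chains of f.write() above are easy to get wrong: one missed ";" silently shifts every later column. A minimal sketch, not part of the project, of writing the same row with the standard csv module; the field order is assumed from the writes above, and write_ing_row is a hypothetical helper name:

import csv
import datetime

def write_ing_row(path, title, loan_type, period, rate, coverage, valid_since):
    #Sketch only: one row in the same semicolon-separated layout as above.
    check_date = datetime.datetime.now().strftime("%Y-%m-%d")
    with open(path, "a") as f:
        writer = csv.writer(f, delimiter=";")
        writer.writerow(["NL", "ING", "ING" + title, loan_type,
                         period, rate, coverage, check_date, valid_since, "N"])

As a side effect, csv.writer also drops the stray trailing semicolon the hand-written version emits after the "N" column.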
Code example #2
def scrap(divRow, validSince):
    f = open("output-rabo.csv", "a+")

    now = datetime.datetime.now()
    tableRows = divRow.xpath("div/table/tbody/tr")
    headers = tableRows[0].xpath("td")

    #the product name comes from the section heading, so compute it once
    productName = doFormattingProductName(
        str(divRow.xpath("h2/text()").extract()))

    for i in range(1, len(tableRows)):
        tableDatas = tableRows[i].xpath("td")
        for j in range(1, len(tableDatas)):
            #static values: country code and provider name
            f.write("NL;Rabobank;")
            f.write("Rabobank")
            f.write(doFormattingUnicode(productName))
            f.write(";")

            #static values start
            f.write("Annuiteitenhypotheek;")

            f.write(
                doFormattingUnicode(
                    str(tableDatas[0].xpath("text()").extract())))
            f.write(";")
            f.write(
                doFormattingUnicode(
                    str(tableDatas[j].xpath("text()").extract())))
            f.write(";")
            #j starts at 1, so no extra guard is needed here
            coverage = doFormattingUnicode(
                str(headers[j].xpath("strong/text()").extract()))
            coverage = findCoverage(coverage).split()
            coverageStart = coverage[0]
            coverageEnd = coverage[1]
            f.write(coverageStart)
            f.write(";")
            f.write(coverageEnd)
            f.write(";")
            f.write(now.strftime("%Y-%m-%d"))
            f.write(";")
            f.write(validSince)
            f.write(";")
            #static value: NHG flag
            f.write("N;")
            f.write("\n")

    f.close()
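scrap() relies on the explicit f.close() at the end, so an XPath error mid-loop would leak the file handle. A small self-contained sketch of the with-statement pattern that closes the file even when a write raises; append_row is a hypothetical helper, not part of the project:

def append_row(path, fields):
    #The with-block closes the handle even if a write raises.
    with open(path, "a") as f:
        f.write(";".join(fields) + "\n")

append_row("output-rabo.csv", ["NL", "Rabobank", "RabobankDemo", "N"])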
Code example #3
def rabo_scraper(divRow, validSince):
    dlist = []
    now = datetime.datetime.now()
    tableRows = divRow.xpath("div/table/tbody/tr")
    headers = tableRows[0].xpath("td")

    #the product name comes from the section heading, so compute it once
    productName = doFormattingUnicode(
        doFormattingProductName(
            str(divRow.xpath("h2/text()").extract())))

    for i in range(1, len(tableRows)):
        tableDatas = tableRows[i].xpath("td")
        for j in range(1, len(tableDatas)):
            item = {}
            #static values
            #Country Code, Provider Name
            item['CountryCode'] = "NL"
            item['ProviderName'] = "Rabobank"
            #Product Name
            item['ProductName'] = "Rabobank" + productName
            #Loan Type
            item['LoanType'] = "Annuiteitenhypotheek"
            #Period
            item['Period'] = doFormattingUnicode(
                str(tableDatas[0].xpath("text()").extract()))
            #Interest Rate
            item['Rate'] = doFormattingUnicode(
                str(tableDatas[j].xpath("text()").extract()))
            coverage = doFormattingUnicode(
                str(headers[j].xpath("strong/text()").extract()))
            coverage = findCoverage(coverage).split()
            coverageStart = coverage[0]
            coverageEnd = coverage[1]
            item['CoverageStart'] = coverageStart
            item['CoverageEnd'] = coverageEnd
            item['CheckDate'] = str(now.strftime("%Y-%m-%d"))
            item['ValidSince'] = validSince
            item['NHG'] = "N"
            dlist.append(item)
    return dlist
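Unlike scrap() in example #2, rabo_scraper() returns plain dicts and leaves serialization to the caller. A sketch of one possible consumer, using csv.DictWriter with field names taken from the keys above; write_items is a hypothetical helper:

import csv

FIELDS = ["CountryCode", "ProviderName", "ProductName", "LoanType",
          "Period", "Rate", "CoverageStart", "CoverageEnd",
          "CheckDate", "ValidSince", "NHG"]

def write_items(path, items):
    #items: a list of dicts as returned by rabo_scraper().
    with open(path, "a") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDS, delimiter=";")
        writer.writerows(items)

Returning data instead of writing inside the scraper keeps the scraping and the output format independent, which example #5 can exploit by collecting the returned items.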
Code example #4
    def parse(self, response):
        log.msg('parse(%s)' % response.url, level=log.DEBUG)

        #remove output file if exists
        try:
            os.remove("output.csv")
        except OSError:
            pass

        divRows = response.xpath("//*[@class='s14-lamella--shadow']")
        items = RabobankItem()  #note: yielded unfilled below; scrap() writes the CSV itself

        #target table headings (h2)
        target1 = [u'Alle rentepercentages hypotheek met Basisvoorwaarden']
        target2 = [u'Alle rentepercentages hypotheek met Plusvoorwaarden']

        #search for the "valid since" date
        validString = response.xpath(
            "//li[contains(text(),'totdat wij de tarieven wijzigen')]/text()"
        ).extract()
        validString = doFormattingUnicode(str(validString))

        regex = r"([0-9]+\s+[a-z]+\s+[0-9])\w+"
        pattern = re.compile(regex)
        date = ""  #fallback so scrap() never sees an undefined date
        matches = pattern.search(validString)
        if matches:
            date = parse(matches.group(0))
            date = str(date).split()[0]

        #CSV columns written by scrap():
        #CountryCode;ProviderName;ProductName;LoanType;Period;Rate;Coverage;Check Date;ValidSinceDate;NHG

        for divRow in divRows:
            head = divRow.xpath("div/h2/text()").extract()
            if head in (target1, target2):
                scrap(divRow.xpath("div"), date)

        yield items
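The regex-plus-dateutil block recurs in almost every example here (examples #1, #4, and three times in #5). A sketch of a shared helper; translate_month stands in for the project's changeMonth, which the call sites suggest maps Dutch month names to English before dateutil parses them:

import re
from dateutil.parser import parse as parse_date

#matches e.g. "18 september 2017"
DATE_RE = re.compile(r"[0-9]+\s+[a-z]+\s+[0-9]+")

def extract_valid_since(text, translate_month=lambda s: s):
    #Returns an ISO date string like "2017-09-18", or None when no date is found.
    m = DATE_RE.search(text)
    if m is None:
        return None
    return parse_date(translate_month(m.group(0))).strftime("%Y-%m-%d")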
Code example #5
File: ing_scrape.py  Project: abpattnayak/Rabobank
    def parse(self, response):

        #ING Bank I

        self.driver = webdriver.PhantomJS()
        self.driver.set_window_size(1120, 550)

        items = []
        self.driver.get(response.url)
        #self.driver.implicitly_wait(50)
        log.msg("URL ----- %s" % response.url, level=log.DEBUG)
        sel = scrapy.Selector(text=self.driver.page_source)

        productName1 = "Annuïteitenhypotheek"
        productName2 = "Lineaire hypotheek"

        #search for the "valid since" date

        validString = sel.xpath(
            "//p[contains(@class, 'small-font') and contains(text(),'Deze tarieven gelden voor nieuwe offertes en renteaanpassingen voor bestaande hypotheken uitgebracht vanaf')]/text()"
        ).extract()

        validString = doFormattingUnicode(
            str(validString[0]).encode("utf-8").strip())

        regex = r"([0-9]+\s+[a-z]+\s+[0-9])\w+"
        pattern = re.compile(regex)
        date = ""  #fallback so ing_scraper() never sees an undefined date
        matches = pattern.search(validString)
        if matches:
            date = changeMonth(matches.group(0))
            date = parse(date)
            date = str(date).split()[0]

        ing_scraper(productName1, sel, date)
        ing_scraper(productName2, sel, date)

        self.driver.quit()  #quit() also terminates the PhantomJS process

        #ING Bank II

        self.driver = webdriver.PhantomJS()
        self.driver.set_window_size(1120, 550)

        url2 = 'https://www.ing.nl/particulier/hypotheken/actuele-hypotheekrente/actuele-hypotheekrente-andere-hypotheken/index.html'
        self.driver.get(url2)
        sel = scrapy.Selector(text=self.driver.page_source)
        log.msg("URL ----- %s" % url2, level=log.DEBUG)
        validString = sel.xpath(
            "//small[contains(text(),'Deze tarieven gelden voor nieuwe offertes en renteaanpassingen voor bestaande hypotheken uitgebracht vanaf')]/text()"
        ).extract()
        log.msg("String ----> %s" % validString, level=log.DEBUG)
        validString = doFormattingUnicode(
            (validString[0]).encode("utf-8").strip())

        regex = r"([0-9]+\s+[a-z]+\s+[0-9])\w+"
        pattern = re.compile(regex)
        date = ""  #fallback so ing_scraper() never sees an undefined date
        matches = pattern.search(validString)
        if matches:
            date = changeMonth(matches.group(0))
            date = parse(date)
            date = str(date).split()[0]

        productName3 = "Aflossingsvrije hypotheek"
        productName4 = "Bankspaarhypotheek"

        ing_scraper(productName3, sel, date)
        ing_scraper(productName4, sel, date)

        self.driver.quit()

        #Rabobank
        self.driver = webdriver.PhantomJS()
        self.driver.set_window_size(1120, 550)

        url2 = 'https://www.rabobank.nl/particulieren/hypotheek/hypotheekrente/?intcamp=pa-hypotheek&inttype=tegel-hypotheekrente&intsource=hypotheek'
        self.driver.get(url2)
        sel = scrapy.Selector(text=self.driver.page_source)
        log.msg("URL ----- %s" % url2, level=log.DEBUG)

        divRows = sel.xpath("//*[@class='s14-lamella--shadow']")

        #target table headings (h2)
        target1 = [u'Alle rentepercentages hypotheek met Basisvoorwaarden']
        target2 = [u'Alle rentepercentages hypotheek met Plusvoorwaarden']

        #search for the "valid since" date
        validString = sel.xpath(
            "//li[contains(text(),'totdat wij de tarieven wijzigen')]/text()"
        ).extract()
        validString = changeMonth(doFormattingUnicode(str(validString)))

        regex = r"([0-9]+\s+[a-z]+\s+[0-9])\w+"
        pattern = re.compile(regex)
        date = ""  #fallback so rabo_scraper() never sees an undefined date
        matches = pattern.search(validString)
        if matches:
            date = parse(matches.group(0))
            date = str(date).split()[0]

        for divRow in divRows:
            head = divRow.xpath("div/h2/text()").extract()
            if head in (target1, target2):
                #rabo_scraper() returns a list of dicts; collect them for the caller
                items.extend(rabo_scraper(divRow.xpath("div"), date))

        self.driver.quit()
        return items
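Each block above spins up a fresh PhantomJS instance, and in the original only the first was being released. A sketch of a fetch helper that always quits the driver, whatever happens during the page load; note PhantomJS support was deprecated in later Selenium releases, so treat this as illustrative:

from selenium import webdriver

def fetch_page_source(url):
    #One short-lived driver per fetch, always released via quit().
    driver = webdriver.PhantomJS()
    try:
        driver.set_window_size(1120, 550)
        driver.get(url)
        return driver.page_source
    finally:
        driver.quit()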