コード例 #1
0
def fetch():
    """Scrape the Coop discounts index page and store each discount.

    Fetches http://www.coop.nl/aanbiedingen, parses every ``div.deal`` and
    inserts the extracted fields via ``db.insert``. A discount missing more
    than ``settings.maxErrors`` fields is skipped; the whole run aborts once
    more than ``settings.maxFailedDiscounts`` discounts have been skipped.
    Always returns None; progress and failures go through the Log* helpers.
    """
    LogI("Fetching Coop discounts...")
    start_time = time.time() * 1000  # wall clock, in milliseconds

    root_url = 'http://www.coop.nl'
    index_url = root_url + '/aanbiedingen'

    try:
        response = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    # Was a bare ``except:``; Exception keeps the broad net but no longer
    # swallows SystemExit/KeyboardInterrupt.
    except Exception:
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    count = 0
    failedcount = 0
    totalexceptions = 0

    # The duration lives in the page header, outside the per-deal divs, so
    # select it once instead of twice per deal as the original did.
    duration_strongs = soup.select(
        'div#ctl00_ctl00_ContentPlaceHolderMain_cpLeftAndContent_Header2_divTextLink div.periode strong'
    )

    category_divs = soup.findAll('div', {'class': 'deal'})
    for div in category_divs:
        exceptioncount = 0
        temp_data = models.defaultModel.copy()
        temp_data['supermarket'] = 'coop'
        temp_data['url'] = index_url

        # PRODUCTNAME -- find() returns None when no <h3> exists, making
        # get_text() raise AttributeError.
        try:
            temp_data['productname'] = div.find('h3').get_text()
        except AttributeError:
            LogE("[IGNORING] Productname not found",
                 "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # DURATION, formatted as "<start> t/m <end>"
        try:
            temp_data['duration'] = (duration_strongs[0].get_text() +
                                     " t/m " + duration_strongs[1].get_text())
        except IndexError as e:
            LogE("[IGNORING] Duration not found", "{0}".format(e))
            exceptioncount += 1

        # IMAGE -- src is site-relative, so prefix the root url.
        except_free = False  # noqa placeholder removed below
        try:
            temp_data['image'] = root_url + div.find('img').get('src')
        except Exception:
            LogE("[IGNORING] Image not found", "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # DESCRIPTION -- the site offers none. A plain literal assignment
        # cannot raise, so the original try/except here was dead code.
        temp_data['description'] = ''

        # AMOUNT -- join all list items of the deal-info block. select()
        # returns a (possibly empty) list, so no exception handling needed.
        temp_data['amount'] = ", ".join(
            str(info.get_text())
            for info in div.select('div.deal-info ul li'))

        # ACTION PRICE -- either an explicit price or one of the known
        # discount badges.
        tempActPrice = div.select('span.deal-euros')
        if tempActPrice:
            temp_data['action_price'] = tempActPrice[0].get_text()
        elif div.select('div.i50procentkorting'):
            temp_data['action_price'] = "50% korting"
        elif div.select('div.i25procentkorting'):
            temp_data['action_price'] = "25% korting"
        elif div.select('div.i2halen1betalen'):
            temp_data['action_price'] = "2 halen, 1 betalen"
        else:
            LogE("[IGNORING] Action price not found",
                 "{0}".format("Skipped all possible options"))
            exceptioncount += 1

        count += 1
        totalexceptions += exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount",
                 "{0} Errors occured".format(exceptioncount))
            failedcount += 1
        else:
            db.insert(temp_data)
            LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count,
                                                    temp_data['productname']))

        if failedcount > settings.maxFailedDiscounts:
            LogE(
                "Skipping this supermarket, too much missing info.",
                "More than {0} discounts missing too much info".format(
                    settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(
                settings.maxFailedDiscounts))
            return

    seconds = (time.time() * 1000) - start_time
    LogI(
        "Done fetching {0} Coop discounts in {1}ms. {2} errors occured and ignored.\n"
        .format(count, format(seconds, '.2f'), totalexceptions))
コード例 #2
0
ファイル: deka.py プロジェクト: aldershoff/SupermarketScraper
def get_actie_data(actie_page_url):
    """Scrape one Deka 'actie' page and store every discount found on it.

    Parses each ``div.aanbieding`` on *actie_page_url* and inserts the
    extracted data via ``db.insert``. Uses the module-level counters
    ``count``, ``failedcount``, ``totalexceptions`` and ``duration`` so
    totals accumulate across multiple actie pages. Returns None.
    """
    try:
        response = requests.get(actie_page_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        # BUG FIX: the original formatted the undefined name 'index_url'
        # here, raising a NameError instead of reporting the failed URL.
        LogE("Failed to connect to '{0}'".format(actie_page_url),
             "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    # Was a bare ``except:``; narrowed to Exception so Ctrl-C still works.
    except Exception:
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    # Counters are shared across calls (one call per actie page).
    global count
    global failedcount
    global totalexceptions
    global duration

    category_divs = soup.findAll('div', {'class': 'aanbieding'})
    for div in category_divs:
        exceptioncount = 0
        temp_data = models.defaultModel.copy()
        temp_data['supermarket'] = 'deka'
        temp_data['url'] = actie_page_url

        # PRODUCTNAME -- find() yields None when no <h2> exists.
        try:
            temp_data['productname'] = div.find('h2').get_text()
        except Exception:
            LogE("[IGNORING] Productname not found",
                 "{0}".format(sys.exc_info()[0]))
            exceptioncount = exceptioncount + 1

        # DURATION -- not available on the page; a literal assignment cannot
        # raise, so no handler is needed.
        temp_data['duration'] = ''

        # DESCRIPTION
        try:
            temp_data['description'] = div.select('div.text')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Description not found", "{0}".format(e))
            exceptioncount = exceptioncount + 1

        # IMAGE -- `root_url` is presumably a module-level constant of
        # deka.py (not visible here) -- TODO confirm.
        try:
            temp_data['image'] = root_url + div.find('img').get('src')
        except Exception:
            LogE("[IGNORING] Image not found", "{0}".format(sys.exc_info()[0]))
            exceptioncount = exceptioncount + 1

        # AMOUNT -- derived from the numeric suffix of the 'tagNN' class.
        try:
            temp_data['amount'] = getAmount(
                div.find('div', {
                    'class': re.compile("tag")
                }).get('class')[1].replace('tag', ''))
        except Exception:
            LogE("[IGNORING] Amount not found",
                 "{0}".format(sys.exc_info()[0]))
            exceptioncount = exceptioncount + 1

        # ACTION PRICE -- whole + fractional parts concatenated.
        try:
            temp_data['action_price'] = div.select(
                'span.current span.whole')[0].get_text() + div.select(
                    'span.current span.part')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Action price not found", "{0}".format(e))
            exceptioncount = exceptioncount + 1

        # OLD PRICE
        try:
            temp_data['old_price'] = div.select('span.old span.whole')[
                0].get_text() + div.select('span.old span.part')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Old price not found", "{0}".format(e))
            exceptioncount = exceptioncount + 1

        count = count + 1
        totalexceptions = totalexceptions + exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount",
                 "{0} Errors occured".format(exceptioncount))
            failedcount = failedcount + 1
        else:
            db.insert(temp_data)
            LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count,
                                                    temp_data['productname']))

        if failedcount > settings.maxFailedDiscounts:
            LogE(
                "Skipping this supermarket, too much missing info.",
                "More than {0} discounts missing too much info".format(
                    settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(
                settings.maxFailedDiscounts))
            return
コード例 #3
0
ファイル: ah.py プロジェクト: tonsmets/SupermarketScraper
def fetch():
    """Scrape the AH bonus page and store each bonus product.

    Fetches http://www.ah.nl/bonus, parses every element carrying
    data-class="product", and inserts the extracted data via db.insert().
    Products missing more than settings.maxErrors fields are skipped; the
    run aborts once more than settings.maxFailedDiscounts products were
    skipped. Always returns None.
    """
    LogI("Fetching AH discounts...")
    start_time = time.time() * 1000  # wall clock, in milliseconds

    index_url = 'http://www.ah.nl/bonus'

    try:
        r = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url),"{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(r.text, 'html5lib')
        soup.encode('utf-8')
    # Was a bare ``except:``; Exception keeps the broad net without
    # swallowing SystemExit/KeyboardInterrupt.
    except Exception:
        LogE("Unable to parse HTML","{0}".format(sys.exc_info()[0]))
        return

    count = 0
    failedcount = 0
    totalexceptions = 0

    bonus_products = soup.findAll(attrs={'data-class': 'product'})
    for bonus in bonus_products:
        exceptioncount = 0
        superdata = models.defaultModel.copy()
        superdata['supermarket'] = 'ah'

        # URL -- two candidate sources: the inner detail anchor, then the
        # element's own href. The second assignment overwrites the first
        # when it succeeds; when bonus.get('href') is None the concatenation
        # raises TypeError and the first assignment survives as a fallback.
        # Only log when neither source produced a URL.
        try:
            superdata['url'] = "http://www.ah.nl" + bonus.select('div.detail a')[0].get('href')
            superdata['url'] = "http://www.ah.nl" + bonus.get('href')
        except (IndexError, TypeError) as e:
            if superdata['url'] is None:
                LogE("[IGNORING] Error","{0}".format(e))
                exceptioncount += 1

        # PRODUCTNAME
        try:
            superdata['productname'] = bonus.select('div.detail h2')[0].get_text().strip()
        except IndexError as e:
            LogE("[IGNORING] Productname not found","{0}".format(e))
            exceptioncount += 1

        # DURATION -- the real selector is disabled (below); a literal
        # assignment cannot raise, so no handler is needed.
        #superdata['duration'] = soup.select('div.columns p.header-bar__term')[0].get_text()
        superdata['duration'] = 'This week'

        # IMAGE -- lazy-loaded, so the real source sits in data-original.
        try:
            superdata['image'] = bonus.select('div.image img')[0].get('data-original')
        except IndexError as e:
            LogE("[IGNORING] Image not found","{0}".format(e))
            exceptioncount += 1

        # AMOUNT
        try:
            tempAmount = bonus.select('div.image p.unit')[0].get_text().strip()
            if tempAmount is not None:
                superdata['amount'] = tempAmount
        except IndexError as e:
            LogE("[IGNORING] Amount not found","{0}".format(e))
            exceptioncount += 1

        # BONUS (e.g. "2 voor 1" shield)
        try:
            superdata['bonus'] = bonus.select('div.shield')[0].get_text().strip()
        except IndexError as e:
            LogE("[IGNORING] Bonus not found","{0}".format(e))
            exceptioncount += 1

        # ACTION PRICE
        try:
            superdata['action_price'] = bonus.select('p.price ins')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Action price not found","{0}".format(e))
            exceptioncount += 1

        # OLD PRICE
        try:
            superdata['old_price'] = bonus.select('p.price del')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Old price not found","{0}".format(e))
            exceptioncount += 1

        count += 1
        totalexceptions += exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount","{0} Errors occured".format(exceptioncount))
            failedcount += 1
        else:
            db.insert(superdata)
            LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count, superdata['productname']))

        if failedcount > settings.maxFailedDiscounts:
            LogE("Skipping this supermarket, too much missing info.","More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
            return

    seconds = (time.time() * 1000) - start_time
    LogI("Done fetching {0} AH discounts in {1}ms. {2} errors occured and ignored.\n".format(count, format(seconds, '.2f'), totalexceptions))
コード例 #4
0
ファイル: coop.py プロジェクト: tonsmets/SupermarketScraper
def fetch():
    """Scrape the Coop discounts index page and store each discount.

    Fetches http://www.coop.nl/aanbiedingen, parses every ``div.deal`` and
    inserts the extracted fields via ``db.insert``. A discount missing more
    than ``settings.maxErrors`` fields is skipped; the run aborts once more
    than ``settings.maxFailedDiscounts`` discounts were skipped.
    Always returns None.
    """
    LogI("Fetching Coop discounts...")
    start_time = time.time() * 1000  # wall clock, in milliseconds

    root_url = 'http://www.coop.nl'
    index_url = root_url + '/aanbiedingen'

    try:
        response = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url),"{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    # Was a bare ``except:``; narrowed to Exception so Ctrl-C still works.
    except Exception:
        LogE("Unable to parse HTML","{0}".format(sys.exc_info()[0]))
        return

    count = 0
    failedcount = 0
    totalexceptions = 0

    # The duration sits in the page header, outside the per-deal divs, so
    # select it once instead of twice per deal as the original did.
    periode = soup.select('div#ctl00_ctl00_ContentPlaceHolderMain_cpLeftAndContent_Header2_divTextLink div.periode strong')

    category_divs = soup.findAll('div', {'class':'deal'})
    for div in category_divs:
        exceptioncount = 0
        temp_data = models.defaultModel.copy()
        temp_data['supermarket'] = 'coop'
        temp_data['url'] = index_url

        # PRODUCTNAME -- find() returns None when no <h3> exists.
        try:
            temp_data['productname'] = div.find('h3').get_text()
        except AttributeError:
            LogE("[IGNORING] Productname not found","{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # DURATION, formatted as "<start> t/m <end>"
        try:
            temp_data['duration'] = periode[0].get_text() + " t/m " + periode[1].get_text()
        except IndexError as e:
            LogE("[IGNORING] Duration not found","{0}".format(e))
            exceptioncount += 1

        # IMAGE -- src is site-relative, so prefix the root url.
        try:
            temp_data['image'] = root_url + div.find('img').get('src')
        except Exception:
            LogE("[IGNORING] Image not found","{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # DESCRIPTION -- the site offers none. A plain literal assignment
        # cannot raise, so the original try/except here was dead code.
        temp_data['description'] = ''

        # AMOUNT -- select() returns a (possibly empty) list, so the join
        # cannot raise IndexError; no handler needed.
        temp_data['amount'] = ", ".join(str(info.get_text()) for info in div.select('div.deal-info ul li'))

        # ACTION PRICE -- explicit price or one of the known discount badges.
        tempActPrice = div.select('span.deal-euros')
        if tempActPrice:
            temp_data['action_price'] = tempActPrice[0].get_text()
        elif div.select('div.i50procentkorting'):
            temp_data['action_price'] = "50% korting"
        elif div.select('div.i25procentkorting'):
            temp_data['action_price'] = "25% korting"
        elif div.select('div.i2halen1betalen'):
            temp_data['action_price'] = "2 halen, 1 betalen"
        else:
            LogE("[IGNORING] Action price not found","{0}".format("Skipped all possible options"))
            exceptioncount += 1

        count += 1
        totalexceptions += exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount","{0} Errors occured".format(exceptioncount))
            failedcount += 1
        else:
            db.insert(temp_data)
            LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count, temp_data['productname']))

        if failedcount > settings.maxFailedDiscounts:
            LogE("Skipping this supermarket, too much missing info.","More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
            return

    seconds = (time.time() * 1000) - start_time
    LogI("Done fetching {0} Coop discounts in {1}ms. {2} errors occured and ignored.\n".format(count, format(seconds, '.2f'), totalexceptions))
コード例 #5
0
def fetch():
    """Scrape the C1000 weekly discounts page and store each discount.

    Fetches http://www.c1000.nl/aanbiedingen, walks every category div
    (ids matching the weekAanbiedingen listView pattern) and every <li>
    inside it, and inserts the extracted data via db.insert(). Discounts
    missing more than settings.maxErrors fields are skipped; the run aborts
    once more than settings.maxFailedDiscounts were skipped. Returns None.
    """
    LogI("Fetching C1000 discounts...")
    start_time = time.time() * 1000  # wall clock, in milliseconds

    root_url = 'http://www.c1000.nl/'
    index_url = root_url + 'aanbiedingen'

    try:
        response = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    # Was a bare ``except:``; narrowed to Exception so Ctrl-C still works.
    except Exception:
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    count = 0
    failedcount = 0
    totalexceptions = 0

    category_divs = soup.findAll(
        'div',
        id=re.compile(
            "^content_0_contentrij1_0_weekAanbiedingen_listViewCategorieen_"))
    for div in category_divs:
        list_items = div.findAll('li')
        for li in list_items:
            exceptioncount = 0
            temp_data = models.defaultModel.copy()
            temp_data['supermarket'] = 'c1000'

            # URL
            temp_data['url'] = index_url

            # PRODUCTNAME -- find() returns None when no <h2> exists.
            try:
                temp_data['productname'] = li.find('h2').get_text().strip()
            except Exception:
                LogE("[IGNORING] Productname not found",
                     "{0}".format(sys.exc_info()[0]))
                exceptioncount = exceptioncount + 1

            # DURATION -- strip tabs/newlines, then collapse the page's
            # long run of padding spaces to a single space.
            try:
                temp_data['duration'] = re.sub(
                    r'[\t\r\n]', '',
                    soup.find(
                        'a', {
                            'id': 'content_0_contentrij1_0_linkTabHuidigeWeek'
                        }).get_text()).strip().replace('                     ',
                                                       ' ')
            except Exception:
                # BUG FIX: the original logged "Productname not found" here.
                LogE("[IGNORING] Duration not found",
                     "{0}".format(sys.exc_info()[0]))
                exceptioncount = exceptioncount + 1

            # DESCRIPTION
            try:
                temp_data['description'] = re.sub(
                    r'[\t\r\n]', '',
                    li.select('div.product_details p')[0].get_text().strip())
            except IndexError as e:
                LogE("[IGNORING] Description not found", "{0}".format(e))
                exceptioncount = exceptioncount + 1

            # IMAGE -- src is site-relative, so prefix the root url.
            try:
                temp_data['image'] = root_url + li.find('img').get('src')
            except Exception:
                LogE("[IGNORING] Image not found",
                     "{0}".format(sys.exc_info()[0]))
                exceptioncount = exceptioncount + 1

            # AMOUNT -- prefer the pricetag <em>; if that selector fails the
            # earlier product_details assignment survives as a fallback, and
            # an error is logged only when neither source produced a value.
            try:
                temp_data['amount'] = li.select(
                    'div.product_details p')[0].get_text().strip()
                temp_data['amount'] = li.select(
                    'div.pricetag em')[0].get_text().strip()
            except IndexError as e:
                if temp_data['amount'] is None:
                    LogE("[IGNORING] Amount not found", "{0}".format(e))
                    exceptioncount = exceptioncount + 1

            # ACTION PRICE -- pricetag <strong> first; fall back to the alt
            # text of the second promo image; log only if both failed.
            try:
                temp_data['action_price'] = li.select(
                    'div.pricetag strong')[0].get_text().strip()
            except (IndexError, TypeError) as e:
                try:
                    temp_data['action_price'] = li.select('img.visual')[1].get(
                        'alt').strip()
                except IndexError:
                    pass
                if temp_data['action_price'] is None:
                    LogE("[IGNORING] Action price not found", "{0}".format(e))
                    exceptioncount = exceptioncount + 1

            # OLD PRICE
            try:
                temp_data['old_price'] = li.select('del')[0].get_text().strip()
            except IndexError as e:
                LogE("[IGNORING] Old price not found", "{0}".format(e))
                exceptioncount = exceptioncount + 1

            count = count + 1
            totalexceptions = totalexceptions + exceptioncount
            if exceptioncount > settings.maxErrors:
                LogE("Too much missing info, skipping this discount",
                     "{0} Errors occured".format(exceptioncount))
                failedcount = failedcount + 1
            else:
                db.insert(temp_data)
                LogD("[{0}] ({1}) Fetched '{2}'".format(
                    exceptioncount, count, temp_data['productname']))

            if failedcount > settings.maxFailedDiscounts:
                LogE(
                    "Skipping this supermarket, too much missing info.",
                    "More than {0} discounts missing too much info".format(
                        settings.maxFailedDiscounts))
                LogI("Skipping this supermarket, too much missing info")
                LogI("More than {0} discounts missing too much info".format(
                    settings.maxFailedDiscounts))
                return

    seconds = (time.time() * 1000) - start_time
    LogI(
        "Done fetching {0} C1000 discounts in {1}ms. {2} errors occured and ignored.\n"
        .format(count, format(seconds, '.2f'), totalexceptions))
コード例 #6
0
def fetch():
    """Scrape the Poiesz discounts page and store each discount.

    Fetches http://www.poiesz-supermarkten.nl/aanbiedingen, parses every
    ``div.meevaller`` and inserts the extracted data via db.insert().
    Discounts missing more than settings.maxErrors fields are skipped; the
    run aborts once more than settings.maxFailedDiscounts were skipped.
    Always returns None.
    """
    LogI("Fetching Poiesz discounts...")
    start_time = time.time() * 1000  # wall clock, in milliseconds

    index_url = 'http://www.poiesz-supermarkten.nl/aanbiedingen'

    try:
        r = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url),"{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(r.text, 'html5lib')
        soup.encode('utf-8')
    # Was a bare ``except:``; narrowed to Exception so Ctrl-C still works.
    except Exception:
        LogE("Unable to parse HTML","{0}".format(sys.exc_info()[0]))
        return

    count = 0
    failedcount = 0
    totalexceptions = 0

    discounts = soup.select('div.meevaller')
    for discount in discounts:
        exceptioncount = 0
        superdata = models.defaultModel.copy()
        superdata['supermarket'] = 'poiesz'

        # URL -- a literal assignment cannot raise, so the original
        # try/except here was dead code.
        superdata['url'] = index_url

        # PRODUCTNAME
        try:
            superdata['productname'] = discount.select('h2')[0].get_text().strip()
        except IndexError as e:
            LogE("[IGNORING] Productname not found","{0}".format(e))
            exceptioncount += 1

        # DURATION -- page-wide, taken from the soup rather than the deal.
        try:
            superdata['duration'] = soup.select('div.validThrough')[0].get_text().strip()
        except IndexError as e:
            LogE("[IGNORING] Duration not found","{0}".format(e))
            exceptioncount += 1

        # IMAGE -- src is site-relative, so prefix the site root.
        try:
            superdata['image'] = "http://www.poiesz-supermarkten.nl/" + discount.select('img')[0].get('src')
        except IndexError as e:
            LogE("[IGNORING] Image not found","{0}".format(e))
            exceptioncount += 1

        # AMOUNT -- collapse all internal whitespace to single spaces.
        try:
            tempAmount = discount.select('div.shieldNew div.top')[0].get_text().strip()
            tempAmount = ' '.join(tempAmount.split())
            if tempAmount is not None:
                superdata['amount'] = tempAmount
        except IndexError as e:
            LogE("[IGNORING] Amount not found","{0}".format(e))
            exceptioncount += 1

        # DESCRIPTION -- concatenate all subtitle divs. Assumes the model's
        # default 'description' is a string, not None -- TODO confirm.
        try:
            subtitles = discount.select('div.subtitle')
            for sub in subtitles:
                if sub.get_text() is not None:
                    superdata['description'] = superdata['description'] + sub.get_text().strip() + ' '
        except IndexError as e:
            LogE("[IGNORING] Description not found","{0}".format(e))
            exceptioncount += 1

        # ACTION PRICE -- whole+part first, combined element as fallback;
        # log only when both selectors failed.
        try:
            tempPrice = discount.select('div.forPrice div.whole')[0].get_text().strip() + "." + discount.select('div.forPrice div.part')[0].get_text().strip()
            superdata['action_price'] = ' '.join(tempPrice.split())
        except IndexError:
            try:
                tempPrice = discount.select('div.forPrice div.combined')[0].get_text().strip()
                superdata['action_price'] = ' '.join(tempPrice.split())
            except IndexError as ex:
                if superdata['action_price'] is None:
                    # BUG FIX: the original logged the outer exception 'e'
                    # here instead of the one that actually fired.
                    LogE("[IGNORING] Action price not found","{0}".format(ex))
                    exceptioncount += 1

        # OLD PRICE
        try:
            tempOld = discount.select('div.fromPriceWrap')[0].get_text().strip()
            superdata['old_price'] = ' '.join(tempOld.split())
        except IndexError as e:
            LogE("[IGNORING] Old price not found","{0}".format(e))
            exceptioncount += 1

        count += 1
        totalexceptions += exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount","{0} Errors occured".format(exceptioncount))
            failedcount += 1
        else:
            db.insert(superdata)
            LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count, superdata['productname']))

        if failedcount > settings.maxFailedDiscounts:
            LogE("Skipping this supermarket, too much missing info.","More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
            return

    seconds = (time.time() * 1000) - start_time
    LogI("Done fetching {0} Poiesz discounts in {1}ms. {2} errors occured and ignored.\n".format(count, format(seconds, '.2f'), totalexceptions))
コード例 #7
0
ファイル: ah.py プロジェクト: aldershoff/SupermarketScraper
def fetch():
    """Scrape the AH bonus page and store each bonus product.

    Fetches http://www.ah.nl/bonus, parses every element carrying
    data-class="product", and inserts the extracted data via db.insert().
    Products missing more than settings.maxErrors fields are skipped; the
    run aborts once more than settings.maxFailedDiscounts products were
    skipped. Always returns None.
    """
    LogI("Fetching AH discounts...")
    start_time = time.time() * 1000  # wall clock, in milliseconds

    index_url = 'http://www.ah.nl/bonus'

    try:
        r = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(r.text, 'html5lib')
        soup.encode('utf-8')
    # Was a bare ``except:``; narrowed to Exception so Ctrl-C still works.
    except Exception:
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    count = 0
    failedcount = 0
    totalexceptions = 0

    bonus_products = soup.findAll(attrs={'data-class': 'product'})
    for bonus in bonus_products:
        exceptioncount = 0
        superdata = models.defaultModel.copy()
        superdata['supermarket'] = 'ah'

        # URL -- two candidate sources: the inner detail anchor, then the
        # element's own href. The second assignment overwrites the first
        # when it succeeds; when bonus.get('href') is None the concatenation
        # raises TypeError and the first assignment survives as a fallback.
        # Only log when neither source produced a URL.
        try:
            superdata['url'] = "http://www.ah.nl" + bonus.select(
                'div.detail a')[0].get('href')
            superdata['url'] = "http://www.ah.nl" + bonus.get('href')
        except (IndexError, TypeError) as e:
            if superdata['url'] is None:
                LogE("[IGNORING] Error", "{0}".format(e))
                exceptioncount = exceptioncount + 1

        # PRODUCTNAME
        try:
            superdata['productname'] = bonus.select(
                'div.detail h2')[0].get_text().strip()
        except IndexError as e:
            LogE("[IGNORING] Productname not found", "{0}".format(e))
            exceptioncount = exceptioncount + 1

        # DURATION -- the real selector is disabled (below); a literal
        # assignment cannot raise, so no handler is needed.
        #superdata['duration'] = soup.select('div.columns p.header-bar__term')[0].get_text()
        superdata['duration'] = 'This week'

        # IMAGE -- lazy-loaded, so the real source sits in data-original.
        try:
            superdata['image'] = bonus.select('div.image img')[0].get(
                'data-original')
        except IndexError as e:
            LogE("[IGNORING] Image not found", "{0}".format(e))
            exceptioncount = exceptioncount + 1

        # AMOUNT
        try:
            tempAmount = bonus.select('div.image p.unit')[0].get_text().strip()
            if tempAmount is not None:
                superdata['amount'] = tempAmount
        except IndexError as e:
            LogE("[IGNORING] Amount not found", "{0}".format(e))
            exceptioncount = exceptioncount + 1

        # BONUS (e.g. "2 voor 1" shield)
        try:
            superdata['bonus'] = bonus.select(
                'div.shield')[0].get_text().strip()
        except IndexError as e:
            LogE("[IGNORING] Bonus not found", "{0}".format(e))
            exceptioncount = exceptioncount + 1

        # ACTION PRICE
        try:
            superdata['action_price'] = bonus.select(
                'p.price ins')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Action price not found", "{0}".format(e))
            exceptioncount = exceptioncount + 1

        # OLD PRICE
        try:
            superdata['old_price'] = bonus.select('p.price del')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Old price not found", "{0}".format(e))
            exceptioncount = exceptioncount + 1

        count = count + 1
        totalexceptions = totalexceptions + exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount",
                 "{0} Errors occured".format(exceptioncount))
            failedcount = failedcount + 1
        else:
            db.insert(superdata)
            LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count,
                                                    superdata['productname']))

        if failedcount > settings.maxFailedDiscounts:
            LogE(
                "Skipping this supermarket, too much missing info.",
                "More than {0} discounts missing too much info".format(
                    settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(
                settings.maxFailedDiscounts))
            return

    seconds = (time.time() * 1000) - start_time
    LogI(
        "Done fetching {0} AH discounts in {1}ms. {2} errors occured and ignored.\n"
        .format(count, format(seconds, '.2f'), totalexceptions))
コード例 #8
0
def fetch():
    """Scrape the Jan Linders weekly-discounts index page and store each
    discount found there in the database.

    Side effects: HTTP GET to janlinders.nl, one ``db.insert`` per usable
    discount, and logging via LogI/LogE/LogD. Returns ``None``; bails out
    early on connection/parse failure or once more than
    ``settings.maxFailedDiscounts`` discounts had to be skipped.
    """
    LogI("Fetching Jan Linders discounts...")
    start_time = time.time() * 1000  # wall clock, in milliseconds

    root_url = 'http://www.janlinders.nl'
    index_url = root_url + '/acties/weekacties/'

    count = 0            # discounts processed (stored or skipped)
    failedcount = 0      # discounts skipped because too much info was missing
    totalexceptions = 0  # total per-field extraction errors that were ignored

    try:
        response = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text, 'html5lib')
        soup.encode('utf-8')
    except Exception:  # was a bare except; broad on purpose (parser errors vary)
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    # Category containers carry a CSS class like "dots_<n>".
    # FIX: raw string so '\d' is a regex escape, not a (deprecated)
    # Python string escape.
    category_divs = soup.find_all('div', class_=re.compile(r'dots_\d+'))

    for div in category_divs:
        div_items = div.findAll('div', {'class': 'hover_discount_product'})
        for actdiv in div_items:
            exceptioncount = 0  # extraction errors for this one discount
            temp_data = models.defaultModel.copy()
            temp_data['supermarket'] = 'janlinders'
            temp_data['url'] = index_url

            # PRODUCTNAME: "<b> <h4>" when a bold prefix exists, else <h4> only.
            if actdiv.select('div.action b'):
                temp_data['productname'] = actdiv.select(
                    'div.action b')[0].get_text() + " " + actdiv.select(
                        'div.action h4')[0].get_text().replace('\n', ' ')
            else:
                try:
                    temp_data['productname'] = actdiv.select(
                        'div.action h4')[0].get_text().replace('\n', ' ')
                except IndexError as e:
                    LogE("[IGNORING] Productname not found", "{0}".format(e))
                    exceptioncount += 1

            # DURATION: taken from the page header, so it is the same for
            # every discount on this page.
            try:
                temp_data['duration'] = soup.select(
                    'div.date-small')[0].get_text()
            except IndexError as e:
                LogE("[IGNORING] Duration not found", "{0}".format(e))
                exceptioncount += 1

            # OLD PRICE
            try:
                tempOldPrice = actdiv.select('.oldprice')[0].get_text()
                # FIX: dropped the "is not ''" identity comparison with a
                # string literal (SyntaxWarning and redundant next to !=).
                if tempOldPrice is not None and tempOldPrice != '':
                    temp_data['old_price'] = tempOldPrice
            except IndexError as e:
                LogE("[IGNORING] Old price not found", "{0}".format(e))
                exceptioncount += 1

            # AMOUNT: the description div minus its child tags leaves the
            # free-floating amount text.
            try:
                tempamount = actdiv.find('div',
                                         {'class': re.compile("^description")})
                for el in tempamount.findAll('h4'):
                    el.extract()
                for el in tempamount.findAll('b'):
                    el.extract()
                for el in tempamount.findAll('span'):
                    el.extract()
                for el in tempamount.findAll('div'):
                    el.extract()
                temp_data['amount'] = tempamount.get_text().replace('\n', ' ')

                # Optional per-unit suffix; absence is not an error.
                try:
                    temp_data['amount'] += ". " + actdiv.select(
                        'div.x_price_w_amount span.small')[0].get_text()
                except IndexError:
                    pass
            except Exception:
                LogE("[IGNORING] Amount not found",
                     "{0}".format(sys.exc_info()[0]))
                exceptioncount += 1

            # IMAGE
            try:
                temp_data['image'] = root_url + actdiv.find('img').get('src')
            except Exception:
                LogE("[IGNORING] Image not found",
                     "{0}".format(sys.exc_info()[0]))
                exceptioncount += 1

            # ACTION PRICE: the site renders prices in several layouts;
            # try them in order of specificity.
            if actdiv.select(
                    'div.action div.regular_price span.big') and actdiv.select(
                        'div.action div.regular_price span.small'):
                try:
                    temp_data['action_price'] = actdiv.select(
                        'div.action div.regular_price span.big'
                    )[0].get_text() + "." + actdiv.select(
                        'div.action div.regular_price span.small')[0].get_text(
                        )
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            # FIX: the original tested 'div.x_price_w_amount div.small' twice;
            # deduplicated to one small + one big check (same behavior).
            elif actdiv.select(
                    'div.x_price_w_amount div.small') and actdiv.select(
                        'div.x_price_w_amount div.big'):
                try:
                    temp_data['action_price'] = actdiv.select(
                        'div.x_price_w_amount div.small'
                    )[0].get_text() + " " + actdiv.select(
                        'div.x_price_w_amount div.big')[0].get_text(
                        ) + "." + actdiv.select(
                            'div.x_price_w_amount div.small')[1].get_text()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.x_free div.big') and actdiv.select(
                    'div.x_free div.small'):
                try:
                    temp_data['action_price'] = ' '.join(
                        actdiv.select('div.x_free div.big')
                        [0].get_text().strip().split()) + " " + ' '.join(
                            actdiv.select('div.x_free div.small')
                            [0].get_text().strip().split())
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.regular_price div.big') and actdiv.select(
                    'div.regular_price div.small'):
                try:
                    temp_data['action_price'] = actdiv.select(
                        'div.regular_price div.big')[0].get_text(
                        ) + "." + actdiv.select(
                            'div.regular_price div.small')[0].get_text()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select(
                    'div.action div.x_discount div.big') and actdiv.select(
                        'div.action div.x_discount div.small'):
                try:
                    temp_data['action_price'] = ' '.join(
                        actdiv.select('div.action div.x_discount div.big')
                        [0].get_text().strip().split()) + " " + actdiv.select(
                            'div.action div.x_discount div.small')[0].get_text(
                            )
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.x_get_for div.small'):
                try:
                    temp_data['action_price'] = ' '.join(
                        actdiv.select('div.x_get_for div.small')
                        [0].get_text().strip().split()) + ", " + actdiv.select(
                            'div.x_get_for div.small')[1].get_text().strip()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.x_half_price div.small'):
                try:
                    temp_data['action_price'] = ' '.join(
                        actdiv.select('div.x_half_price div.small')
                        [0].get_text().strip().split()) + " " + actdiv.select(
                            'div.x_half_price div.small')[1].get_text().strip(
                            )
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            else:
                LogE("[IGNORING] Action price not found", "None")
                exceptioncount += 1
            totalexceptions += exceptioncount

            count += 1
            if exceptioncount > settings.maxErrors:
                LogE("Too much missing info, skipping this discount",
                     "{0} Errors occured".format(exceptioncount))
                failedcount += 1
            else:
                db.insert(temp_data)
                LogD("[{0}] ({1}) Fetched '{2}'".format(
                    exceptioncount, count, temp_data['productname']))

            # Abort the whole supermarket once too many discounts failed.
            if failedcount > settings.maxFailedDiscounts:
                LogE(
                    "Skipping this supermarket, too much missing info.",
                    "More than {0} discounts missing too much info".format(
                        settings.maxFailedDiscounts))
                LogI("Skipping this supermarket, too much missing info")
                LogI("More than {0} discounts missing too much info".format(
                    settings.maxFailedDiscounts))
                return

    seconds = (time.time() * 1000) - start_time
    LogI(
        "Done fetching {0} Jan Linders discounts in {1}ms. {2} errors occured and ignored.\n"
        .format(count, format(seconds, '.2f'), totalexceptions))
コード例 #9
0
ファイル: dirk.py プロジェクト: tonsmets/SupermarketScraper
def get_actie_data(actie_page_url):
    """Fetch one Dirk discount detail page and store the parsed discount.

    :param actie_page_url: path of the detail page, appended to the
        module-level ``root_url``.

    Updates the module-level counters ``count``, ``failedcount`` and
    ``totalexceptions``; inserts into ``db`` unless more than
    ``settings.maxErrors`` fields were missing. Returns ``None``.
    """
    global count
    global failedcount
    global totalexceptions
    exceptioncount = 0  # extraction errors for this one discount
    actie_data = models.defaultModel.copy()
    actie_data['supermarket'] = 'dirk'
    url = root_url + actie_page_url
    try:
        response = requests.get(url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        # FIX: log the URL that actually failed; the original referenced
        # 'index_url', which is not defined in this function.
        LogE("Failed to connect to '{0}'".format(url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    except Exception:  # was a bare except; broad on purpose (parser errors vary)
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    actie_data['url'] = root_url + actie_page_url

    # PRODUCTNAME
    try:
        actie_data['productname'] = soup.find('h2').get_text()
    except Exception:  # find() returns None when absent -> AttributeError
        LogE("[IGNORING] Productname not found", "{0}".format(sys.exc_info()[0]))
        exceptioncount += 1

    # DURATION
    try:
        actie_data['duration'] = soup.select('div.fromTill')[0].get_text().strip()
    except IndexError as e:
        LogE("[IGNORING] Duration not found", "{0}".format(e))
        exceptioncount += 1

    # AMOUNT (only stored when non-blank)
    try:
        amount = soup.select('div.subtitle')[0].get_text().strip()
        if (amount != '' and amount != ' ' and amount is not None):
            actie_data['amount'] = amount
    except IndexError as e:
        LogE("[IGNORING] Amount not found", "{0}".format(e))
        exceptioncount += 1

    # IMAGE: the image URL is embedded in an inline background-image style.
    try:
        div_style = soup.find('div', {'class': 'image'})['style']
        style = cssutils.parseStyle(div_style)
        url = style['background-image']
        url = url.replace('url(', '').replace(')', '')
        actie_data['image'] = root_url + url
    except Exception:  # missing div, missing style attr, or CSS parse error
        LogE("[IGNORING] Image not found", "{0}".format(sys.exc_info()[0]))
        exceptioncount += 1

    # ACTION PRICE: title attribute like "€ 1,99" -> "1.99"
    try:
        actie_data['action_price'] = soup.select('div.star')[0].get('title').strip().replace(u"\u20AC ", "").replace(",", ".")
    except IndexError as e:
        LogE("[IGNORING] Action price not found", "{0}".format(e))
        exceptioncount += 1

    # OLD PRICE
    try:
        actie_data['old_price'] = soup.select('span.stripe')[0].get_text()
    except IndexError as e:
        LogE("[IGNORING] Old price not found", "{0}".format(e))
        exceptioncount += 1

    totalexceptions = totalexceptions + exceptioncount

    count = count + 1
    if exceptioncount > settings.maxErrors:
        LogE("Too much missing info, skipping this discount", "{0} Errors occured".format(exceptioncount))
        failedcount = failedcount + 1
    else:
        db.insert(actie_data)
        LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count, actie_data['productname']))

    # Abort the whole supermarket once too many discounts failed.
    if failedcount > settings.maxFailedDiscounts:
        LogE("Skipping this supermarket, too much missing info.", "More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
        LogI("Skipping this supermarket, too much missing info")
        LogI("More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
        return
コード例 #10
0
ファイル: aldi.py プロジェクト: aldershoff/SupermarketScraper
def fetch():
    """Scrape all Aldi weekly-discount pages linked from the site menu and
    store every discount found in the database.

    Side effects: HTTP GETs to aldi.nl, one ``db.insert`` per usable
    discount, logging via LogI/LogE/LogD. Returns ``None``; bails out on
    connection/parse failure or once more than
    ``settings.maxFailedDiscounts`` discounts had to be skipped.
    """
    LogI("Fetching Aldi discounts...")
    start_time = time.time() * 1000  # wall clock, in milliseconds

    index_url = 'http://www.aldi.nl/'

    try:
        r = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(r.text, 'html5lib')
        soup.encode('utf-8')
    except Exception:  # was a bare except; broad on purpose (parser errors vary)
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    count = 0            # discounts processed (stored or skipped)
    failedcount = 0      # discounts skipped because too much info was missing
    totalexceptions = 0  # total per-field extraction errors that were ignored

    # FIX: guard the menu lookup; previously an empty select() result
    # crashed the whole fetch with an uncaught IndexError.
    try:
        pages = soup.select('ul#ul_menu_142002 li')[0].select('ul li a')
    except IndexError as e:
        LogE("Discount menu not found", "{0}".format(e))
        return

    for page in pages:
        try:
            r = requests.get(index_url + page.get('href'), headers=settings.headers)
        except requests.exceptions.ConnectionError as ce:
            LogE("Failed to connect to '{0}'".format(index_url), "{0}".format(ce))
            return

        try:
            soup = bs4.BeautifulSoup(r.text, 'html5lib')
            soup.encode('utf-8')
        except Exception:
            LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
            return

        # Page-wide duration, shared by all discounts on this page.
        duration = "n/a"
        try:
            duration = "Vanaf " + soup.select('li.active h2.tab-headline span')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Duration not found", "{0}".format(e))
            totalexceptions += 1

        discounts = soup.select('div.product-tile')
        for discount in discounts:
            exceptioncount = 0  # extraction errors for this one discount
            superdata = models.defaultModel.copy()
            superdata['supermarket'] = 'aldi'

            # URL (second anchor in the tile)
            try:
                superdata['url'] = index_url + discount.select('a')[1].get('href')
            except (IndexError, TypeError) as e:
                # Only count it as an error when the model has no URL at all.
                if superdata['url'] is None:
                    LogE("[IGNORING] Error", "{0}".format(e))
                    exceptioncount += 1

            # PRODUCTNAME
            try:
                superdata['productname'] = discount.select('h3')[0].get_text().strip()
            except IndexError as e:
                LogE("[IGNORING] Productname not found", "{0}".format(e))
                exceptioncount += 1

            # DURATION — FIX: removed the dead try/except IndexError that
            # wrapped this plain assignment (it could never raise).
            superdata['duration'] = duration

            # IMAGE
            try:
                superdata['image'] = index_url + discount.select('img')[0].get('src')
            except IndexError as e:
                LogE("[IGNORING] Image not found", "{0}".format(e))
                exceptioncount += 1

            # AMOUNT — FIX: dropped the always-true 'is not None' check;
            # get_text().strip() always returns a str.
            try:
                superdata['amount'] = discount.select('div.unit')[0].get_text().strip()
            except IndexError as e:
                LogE("[IGNORING] Amount not found", "{0}".format(e))
                exceptioncount += 1

            # ACTION PRICE (strip the footnote asterisk)
            try:
                superdata['action_price'] = discount.select('strong')[0].get_text().replace('*', '')
            except IndexError as e:
                LogE("[IGNORING] Action price not found", "{0}".format(e))
                exceptioncount += 1

            # DESCRIPTION
            try:
                superdata['description'] = discount.select('div.richtext')[0].get_text().strip()
            except IndexError as e:
                LogE("[IGNORING] Description not found", "{0}".format(e))
                exceptioncount += 1

            # OLD PRICE — Aldi never shows one; FIX: removed the dead
            # try/except around this constant assignment.
            superdata['old_price'] = "n/a"

            count += 1
            totalexceptions += exceptioncount
            if exceptioncount > settings.maxErrors:
                LogE("Too much missing info, skipping this discount", "{0} Errors occured".format(exceptioncount))
                failedcount += 1
            else:
                db.insert(superdata)
                LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count, superdata['productname']))

            # Abort the whole supermarket once too many discounts failed.
            if failedcount > settings.maxFailedDiscounts:
                LogE("Skipping this supermarket, too much missing info.", "More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
                LogI("Skipping this supermarket, too much missing info")
                LogI("More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
                return

    seconds = (time.time() * 1000) - start_time
    LogI("Done fetching {0} Aldi discounts in {1}ms. {2} errors occured and ignored.\n".format(count, format(seconds, '.2f'), totalexceptions))
コード例 #11
0
ファイル: deka.py プロジェクト: tonsmets/SupermarketScraper
def get_actie_data(actie_page_url):
    """Fetch one DekaMarkt discounts page and store every discount on it.

    :param actie_page_url: absolute URL of the discounts page.

    Updates the module-level counters ``count``, ``failedcount`` and
    ``totalexceptions``; inserts into ``db`` unless more than
    ``settings.maxErrors`` fields were missing. Returns ``None``.
    """
    try:
        response = requests.get(actie_page_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        # FIX: log the URL that actually failed; the original referenced
        # 'index_url', which is not defined in this function.
        LogE("Failed to connect to '{0}'".format(actie_page_url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    except Exception:  # was a bare except; broad on purpose (parser errors vary)
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    global count
    global failedcount
    global totalexceptions
    global duration

    category_divs = soup.findAll('div', {'class': 'aanbieding'})
    for div in category_divs:
        exceptioncount = 0  # extraction errors for this one discount
        temp_data = models.defaultModel.copy()
        temp_data['supermarket'] = 'deka'
        temp_data['url'] = actie_page_url

        # PRODUCTNAME
        try:
            temp_data['productname'] = div.find('h2').get_text()
        except Exception:  # find() returns None when absent -> AttributeError
            LogE("[IGNORING] Productname not found", "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # DURATION — not available on this page; FIX: removed the dead
        # try/except that wrapped this constant assignment.
        temp_data['duration'] = ''

        # DESCRIPTION
        try:
            temp_data['description'] = div.select('div.text')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Description not found", "{0}".format(e))
            exceptioncount += 1

        # IMAGE
        try:
            temp_data['image'] = root_url + div.find('img').get('src')
        except Exception:
            LogE("[IGNORING] Image not found", "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # AMOUNT: encoded in the second CSS class of the "tagNN" div,
        # decoded by the module-level getAmount() helper.
        try:
            temp_data['amount'] = getAmount(div.find('div', {'class': re.compile("tag")}).get('class')[1].replace('tag', ''))
        except Exception:
            LogE("[IGNORING] Amount not found", "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # ACTION PRICE: whole + fractional part rendered in separate spans.
        try:
            temp_data['action_price'] = div.select('span.current span.whole')[0].get_text() + div.select('span.current span.part')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Action price not found", "{0}".format(e))
            exceptioncount += 1

        # OLD PRICE
        try:
            temp_data['old_price'] = div.select('span.old span.whole')[0].get_text() + div.select('span.old span.part')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Old price not found", "{0}".format(e))
            exceptioncount += 1

        count = count + 1
        totalexceptions = totalexceptions + exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount", "{0} Errors occured".format(exceptioncount))
            failedcount = failedcount + 1
        else:
            db.insert(temp_data)
            LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count, temp_data['productname']))

        # Abort the whole supermarket once too many discounts failed.
        if failedcount > settings.maxFailedDiscounts:
            LogE("Skipping this supermarket, too much missing info.", "More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
            return
コード例 #12
0
def fetch():
    """Scrape the Jan Linders weekly-discounts index page and store each
    discount found there in the database.

    Side effects: HTTP GET to janlinders.nl, one ``db.insert`` per usable
    discount, and logging via LogI/LogE/LogD. Returns ``None``; bails out
    early on connection/parse failure or once more than
    ``settings.maxFailedDiscounts`` discounts had to be skipped.
    """
    LogI("Fetching Jan Linders discounts...")
    start_time = time.time() * 1000  # wall clock, in milliseconds

    root_url = 'http://www.janlinders.nl'
    index_url = root_url + '/acties/weekacties/'

    count = 0            # discounts processed (stored or skipped)
    failedcount = 0      # discounts skipped because too much info was missing
    totalexceptions = 0  # total per-field extraction errors that were ignored

    try:
        response = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text, 'html5lib')
        soup.encode('utf-8')
    except Exception:  # was a bare except; broad on purpose (parser errors vary)
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    # Category containers carry a CSS class like "dots_<n>".
    # FIX: raw string so '\d' is a regex escape, not a (deprecated)
    # Python string escape.
    category_divs = soup.find_all('div', class_=re.compile(r'dots_\d+'))

    for div in category_divs:
        div_items = div.findAll('div', {'class': 'hover_discount_product'})
        for actdiv in div_items:
            exceptioncount = 0  # extraction errors for this one discount
            temp_data = models.defaultModel.copy()
            temp_data['supermarket'] = 'janlinders'
            temp_data['url'] = index_url

            # PRODUCTNAME: "<b> <h4>" when a bold prefix exists, else <h4> only.
            if actdiv.select('div.action b'):
                temp_data['productname'] = actdiv.select('div.action b')[0].get_text() + " " + actdiv.select('div.action h4')[0].get_text().replace('\n', ' ')
            else:
                try:
                    temp_data['productname'] = actdiv.select('div.action h4')[0].get_text().replace('\n', ' ')
                except IndexError as e:
                    LogE("[IGNORING] Productname not found", "{0}".format(e))
                    exceptioncount += 1

            # DURATION: taken from the page header, shared by all discounts.
            try:
                temp_data['duration'] = soup.select('div.date-small')[0].get_text()
            except IndexError as e:
                LogE("[IGNORING] Duration not found", "{0}".format(e))
                exceptioncount += 1

            # OLD PRICE
            try:
                tempOldPrice = actdiv.select('.oldprice')[0].get_text()
                # FIX: dropped the "is not ''" identity comparison with a
                # string literal (SyntaxWarning and redundant next to !=).
                if tempOldPrice is not None and tempOldPrice != '':
                    temp_data['old_price'] = tempOldPrice
            except IndexError as e:
                LogE("[IGNORING] Old price not found", "{0}".format(e))
                exceptioncount += 1

            # AMOUNT: the description div minus its child tags leaves the
            # free-floating amount text.
            try:
                tempamount = actdiv.find('div', {'class': re.compile("^description")})
                for el in tempamount.findAll('h4'):
                    el.extract()
                for el in tempamount.findAll('b'):
                    el.extract()
                for el in tempamount.findAll('span'):
                    el.extract()
                for el in tempamount.findAll('div'):
                    el.extract()
                temp_data['amount'] = tempamount.get_text().replace('\n', ' ')

                # Optional per-unit suffix; absence is not an error.
                try:
                    temp_data['amount'] += ". " + actdiv.select('div.x_price_w_amount span.small')[0].get_text()
                except IndexError:
                    pass
            except Exception:
                LogE("[IGNORING] Amount not found", "{0}".format(sys.exc_info()[0]))
                exceptioncount += 1

            # IMAGE
            try:
                temp_data['image'] = root_url + actdiv.find('img').get('src')
            except Exception:
                LogE("[IGNORING] Image not found", "{0}".format(sys.exc_info()[0]))
                exceptioncount += 1

            # ACTION PRICE: the site renders prices in several layouts;
            # try them in order of specificity.
            if actdiv.select('div.action div.regular_price span.big') and actdiv.select('div.action div.regular_price span.small'):
                try:
                    temp_data['action_price'] = actdiv.select('div.action div.regular_price span.big')[0].get_text() + "." + actdiv.select('div.action div.regular_price span.small')[0].get_text()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            # FIX: the original tested 'div.x_price_w_amount div.small' twice;
            # deduplicated to one small + one big check (same behavior).
            elif actdiv.select('div.x_price_w_amount div.small') and actdiv.select('div.x_price_w_amount div.big'):
                try:
                    temp_data['action_price'] = actdiv.select('div.x_price_w_amount div.small')[0].get_text() + " " + actdiv.select('div.x_price_w_amount div.big')[0].get_text() + "." + actdiv.select('div.x_price_w_amount div.small')[1].get_text()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.x_free div.big') and actdiv.select('div.x_free div.small'):
                try:
                    temp_data['action_price'] = ' '.join(actdiv.select('div.x_free div.big')[0].get_text().strip().split()) + " " + ' '.join(actdiv.select('div.x_free div.small')[0].get_text().strip().split())
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.regular_price div.big') and actdiv.select('div.regular_price div.small'):
                try:
                    temp_data['action_price'] = actdiv.select('div.regular_price div.big')[0].get_text() + "." + actdiv.select('div.regular_price div.small')[0].get_text()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.action div.x_discount div.big') and actdiv.select('div.action div.x_discount div.small'):
                try:
                    temp_data['action_price'] = ' '.join(actdiv.select('div.action div.x_discount div.big')[0].get_text().strip().split()) + " " + actdiv.select('div.action div.x_discount div.small')[0].get_text()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.x_get_for div.small'):
                try:
                    temp_data['action_price'] = ' '.join(actdiv.select('div.x_get_for div.small')[0].get_text().strip().split()) + ", " + actdiv.select('div.x_get_for div.small')[1].get_text().strip()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.x_half_price div.small'):
                try:
                    temp_data['action_price'] = ' '.join(actdiv.select('div.x_half_price div.small')[0].get_text().strip().split()) + " " + actdiv.select('div.x_half_price div.small')[1].get_text().strip()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            else:
                LogE("[IGNORING] Action price not found", "None")
                exceptioncount += 1
            totalexceptions += exceptioncount

            count += 1
            if exceptioncount > settings.maxErrors:
                LogE("Too much missing info, skipping this discount", "{0} Errors occured".format(exceptioncount))
                failedcount += 1
            else:
                db.insert(temp_data)
                LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count, temp_data['productname']))

            # Abort the whole supermarket once too many discounts failed.
            if failedcount > settings.maxFailedDiscounts:
                LogE("Skipping this supermarket, too much missing info.", "More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
                LogI("Skipping this supermarket, too much missing info")
                LogI("More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
                return

    seconds = (time.time() * 1000) - start_time
    LogI("Done fetching {0} Jan Linders discounts in {1}ms. {2} errors occured and ignored.\n".format(count, format(seconds, '.2f'), totalexceptions))
コード例 #13
0
ファイル: dirk.py プロジェクト: aldershoff/SupermarketScraper
def get_actie_data(actie_page_url):
    """Fetch and parse a single Dirk discount ("actie") detail page.

    Requests root_url + actie_page_url, scrapes productname, duration,
    amount, image and prices into a copy of models.defaultModel, and
    stores the record via db.insert() unless more than settings.maxErrors
    fields were missing.  Updates the module-level count / failedcount /
    totalexceptions counters; returns early (None) once failedcount
    exceeds settings.maxFailedDiscounts.
    """
    global count
    global failedcount
    global totalexceptions
    exceptioncount = 0
    # Start from a fresh copy of the shared default discount model.
    actie_data = models.defaultModel.copy()
    actie_data['supermarket'] = 'dirk'
    url = root_url + actie_page_url
    try:
        response = requests.get(url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        # BUG FIX: log the URL we actually requested.  The original logged
        # `index_url`, which is not defined in this function and would have
        # raised NameError inside this handler.
        LogE("Failed to connect to '{0}'".format(url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    except Exception:
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    actie_data['url'] = url

    # PRODUCTNAME: first <h2> on the detail page.
    try:
        actie_data['productname'] = soup.find('h2').get_text()
    except Exception:
        LogE("[IGNORING] Productname not found",
             "{0}".format(sys.exc_info()[0]))
        exceptioncount += 1

    # DURATION: validity period, e.g. "van ... t/m ...".
    try:
        actie_data['duration'] = soup.select(
            'div.fromTill')[0].get_text().strip()
    except IndexError as e:
        LogE("[IGNORING] Duration not found", "{0}".format(e))
        exceptioncount += 1

    # AMOUNT: only stored when the (stripped) subtitle text is non-empty.
    try:
        amount = soup.select('div.subtitle')[0].get_text().strip()
        if amount:
            actie_data['amount'] = amount
    except IndexError as e:
        LogE("[IGNORING] Amount not found", "{0}".format(e))
        exceptioncount += 1

    # IMAGE: extracted from the inline CSS background-image of div.image.
    # Uses a separate name so the request `url` above is not clobbered.
    try:
        div_style = soup.find('div', {'class': 'image'})['style']
        style = cssutils.parseStyle(div_style)
        image_url = style['background-image']
        image_url = image_url.replace('url(', '').replace(')', '')
        actie_data['image'] = root_url + image_url
    except Exception:
        LogE("[IGNORING] Image not found", "{0}".format(sys.exc_info()[0]))
        exceptioncount += 1

    # ACTION PRICE: the star div's title holds e.g. "\u20ac 1,99" -> "1.99".
    try:
        actie_data['action_price'] = soup.select('div.star')[0].get(
            'title').strip().replace(u"\u20AC ", "").replace(",", ".")
    except IndexError as e:
        LogE("[IGNORING] Action price not found", "{0}".format(e))
        exceptioncount += 1

    # OLD PRICE: the struck-through regular price.
    try:
        actie_data['old_price'] = soup.select('span.stripe')[0].get_text()
    except IndexError as e:
        LogE("[IGNORING] Old price not found", "{0}".format(e))
        exceptioncount += 1

    totalexceptions = totalexceptions + exceptioncount

    count = count + 1
    if exceptioncount > settings.maxErrors:
        LogE("Too much missing info, skipping this discount",
             "{0} Errors occured".format(exceptioncount))
        failedcount = failedcount + 1
    else:
        db.insert(actie_data)
        LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count,
                                                actie_data['productname']))

    if failedcount > settings.maxFailedDiscounts:
        LogE(
            "Skipping this supermarket, too much missing info.",
            "More than {0} discounts missing too much info".format(
                settings.maxFailedDiscounts))
        LogI("Skipping this supermarket, too much missing info")
        LogI("More than {0} discounts missing too much info".format(
            settings.maxFailedDiscounts))
        return
# Code example #14 (score: 0)
def get_discount_data(actie_page_url):
    """Scrape every Jumbo discount listed on the given results page.

    Fetches actie_page_url, iterates the <li class="jum-result"> items,
    fills a copy of models.defaultModel per item and stores it via
    db.insert() unless more than settings.maxErrors fields were missing.
    Uses the module-level `duration` and `index_url` globals; updates the
    module-level count / failedcount / totalexceptions counters.  Returns
    early (None) once failedcount exceeds settings.maxFailedDiscounts,
    otherwise returns `output`.
    """
    global duration
    global count
    global totalexceptions
    global failedcount

    try:
        response = requests.get(actie_page_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        # BUG FIX: report the page we actually requested; the original
        # logged the unrelated module-level index_url.
        LogE("Failed to connect to '{0}'".format(actie_page_url),
             "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    except Exception:
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    # NOTE(review): `output` is returned but never appended to, so callers
    # always receive an empty list — presumably a leftover; verify callers
    # before populating or removing it.
    output = []

    for discount in soup.findAll('li', {'class': 'jum-result'}):
        exceptioncount = 0
        discount_data = models.defaultModel.copy()
        discount_data['supermarket'] = 'jumbo'
        discount_data['url'] = index_url

        # PRODUCTNAME
        try:
            discount_data['productname'] = discount.find('h3').get_text()
        except Exception:
            # BUG FIX: the original logged "Image not found" here.
            LogE("[IGNORING] Productname not found",
                 "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # DURATION is shared across the whole page (module-level global).
        discount_data['duration'] = duration

        # AMOUNT
        try:
            discount_data['amount'] = discount.select(
                'dd.jum-promotion-text-field')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Amount not found", "{0}".format(e))
            exceptioncount += 1

        # IMAGE
        try:
            discount_data['image'] = 'http://www.jumbo.com' + discount.select(
                'dd.jum-item-figure img')[0].get('src')
        except IndexError as e:
            LogE("[IGNORING] Image not found", "{0}".format(e))
            exceptioncount += 1

        # ACTION PRICE: find() returns None when absent, so stripping the
        # prefix raises AttributeError — that is the "not found" signal.
        try:
            discount_data['action_price'] = discount.find(
                text=re.compile('Actieprijs')).replace("Actieprijs ", "")
        except AttributeError as e:
            LogE("[IGNORING] Action price not found", "{0}".format(e))
            exceptioncount += 1

        # OLD PRICE: same None -> AttributeError pattern as above.
        try:
            discount_data['old_price'] = discount.find(
                text=re.compile('Normale prijs')).replace("Normale prijs ", "")
        except AttributeError as e:
            LogE("[IGNORING] Old price not found", "{0}".format(e))
            exceptioncount += 1

        count = count + 1
        totalexceptions = totalexceptions + exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount",
                 "{0} Errors occured".format(exceptioncount))
            failedcount = failedcount + 1
        else:
            db.insert(discount_data)
            LogD("[{0}] ({1}) Fetched '{2}'".format(
                exceptioncount, count, discount_data['productname']))

        if failedcount > settings.maxFailedDiscounts:
            LogE("Skipping this supermarket, too much missing info.",
                 "More than {0} discounts missing too much info".format(
                     settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(
                settings.maxFailedDiscounts))
            return

    return output