# 'gin': '',#no gin # 'tequila': '',#no tequila # 'liquor': '',#no liquor # 'white_wine': 'https://www.aeondewine.com/shop/c/c060102/?l:inkid=aw69_avGM7kHb', # 'red_wine': 'https://www.aeondewine.com/shop/c/c060101/?linkid=aw69_Xl3132nk', # 'bourbon': '',#no bourbon # 'brandy': '',#no brandy # 'rum': '',#no rum } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 print("Beginning ", ctg, url) if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)): driver.get(url) # If files exist, don't scrap perform_scrapping = not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)) for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath) and perform_scrapping: sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) print(fpath, ctg, p, len(categories[ctg])) # Break or change pages if number_of_pdcts_in_ctg == len(categories[ctg]): print("Finished, because no more new products")
urls_ctgs_dict = { 'champagne': 'https://www.vicampo.de/weine/subart/Champagner', 'sparkling': 'https://www.vicampo.de/weine/subart/Sekt', 'still_wines': 'https://www.vicampo.de/weine/art/Wei%C3%9Fwein', 'white_wine': 'https://www.vicampo.de/weine/art/Wei%C3%9Fwein', 'red_wine': 'https://www.vicampo.de/weine/subart/Rotwein', } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)): driver.get(url) for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): sleep(2) driver.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") sleep(2) driver.waitclick('//button[@data-dismiss="modal"]', timeout=7) driver.save_page(fpath, scroll_to_bottom=False) tree = etree.parse(open(fpath, 'rb'), parser=parser) for li in tree.xpath('//article[@data-entity-type="product"]'): produrl = li.xpath('.//a[@data-ec-linklabel="Product Text"]/@href')[0] produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(urlsplit(produrl).query) else produrl products[produrl] = { 'pdct_name_on_eretailer': " ".join("".join(li.xpath('.//h1//text()')).split()),
from create_csvs import create_csvs from custom_browser import CustomDriver from ers import COLLECTION_DATE, file_hash, img_path_namer from ers import all_keywords_fr as keywords, TEST_PAGES_FOLDER_PATH from ers import clean_xpathd_text from ers import fpath_namer, mh_brands, clean_url, headers from matcher import BrandMatcher from validators import validate_raw_files parser = etree.HTMLParser() # Init variables and assets shop_id = 'gourmondo' root_url = 'https://www.gourmondo.de' requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache')) country = 'DE' searches, categories, products = {}, {}, {} driver = CustomDriver(headless=True, download_images=True) def getprice(pricestr): pricestr = re.sub("[^0-9.€,]", "", pricestr) if pricestr.endswith('*'): pricestr = pricestr[:-1] if not pricestr: return price = parse('{pound:d}€', pricestr) if price: return price.named['pound'] * 100 price = parse('{pound:d},{pence:d}€', pricestr)
from validators import validate_raw_files from create_csvs import create_csvs from ers import all_keywords_usa as keywords, fpath_namer, mh_brands, clean_url, headers from matcher import BrandMatcher from ers import COLLECTION_DATE, file_hash, img_path_namer import shutil from custom_browser import CustomDriver from parse import parse import re # Init variables and assets shop_id = "pogos_wine_spirits" root_url = "https://www.pogoswine.com/" requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache')) country = "USA" searches, categories, products = {}, {}, {} # If necessary driver = CustomDriver(headless=True) def getprice(pricestr): if pricestr == '': return pricestr if pricestr.count('$') >= 2: pricestr = "$" + pricestr.split('$')[1] pricestr = re.sub("[^0-9.$]", "", pricestr) price = parse('${pound:d}.{pence:d}', pricestr)
import requests import requests_cache, imghdr from ers import all_keywords_es as keywords, fpath_namer, mh_brands, clean_url, headers from matcher import BrandMatcher from ers import COLLECTION_DATE, file_hash, img_path_namer import shutil from custom_browser import CustomDriver from parse import parse from validators import validate_raw_files from create_csvs import create_csvs # Init variables and assets shop_id = "bodeboca" root_url = "https://bodeboca.com" requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache')) country = "ES" searches, categories, products = {}, {}, {} # If necessary driver = CustomDriver(headless=False, download_images=True, firefox=True) def getprice(pricestr): if pricestr == '': return pricestr pricestr = re.sub("[^0-9,€]", "", pricestr) pricestr = pricestr.split('€')[0] + '€' price = parse('{pound:d},{pence:d}€', pricestr) if price is None: price = parse('{pence:d}p', pricestr)
parser = etree.HTMLParser()

from urllib.parse import quote_plus
import requests
import requests_cache, imghdr
from validators import validate_raw_files
from create_csvs import create_csvs
from ers import all_keywords_usa as keywords, fpath_namer, mh_brands, clean_url, headers
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer
import shutil
import re

# Init variables and assets
shop_id = 'astor_wines'
root_url = 'http://www.astorwines.com'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'USA'
searches, categories, products = {}, {}, {}
from parse import parse


def getprice(pricestr):
    """Convert a USD price string (e.g. '$12.34') to integer cents.

    Returns None for empty/blank input. Raises AttributeError if neither
    pattern matches (unchanged behaviour of the fallback branch).
    """
    if not pricestr:
        return
    # Keep only digits, dots and dollar signs (commas are stripped here,
    # so '$1,234.56' normalises to '$1234.56' and matches the first pattern).
    pricestr = re.sub("[^0-9.$]", "", pricestr)
    price = parse('${pound:d}.{pence:d}', pricestr)
    if not price:
        # Defensive thousands form: '$t,ppp.cc' -> t*100000 + ppp*100 + cc cents.
        price = parse('${th:d},{pound:d}.{pence:d}', pricestr)
        return price.named['th'] * 100000 + price.named[
            'pound'] * 100 + price.named['pence']
    # BUG FIX: the original fell through and returned None when the simple
    # '$x.yy' pattern matched; return its value in cents (cf. the 'tower'
    # variant of this helper, which has this return).
    return price.named['pound'] * 100 + price.named['pence']
import requests_cache, imghdr from validators import validate_raw_files from create_csvs import create_csvs from ers import all_keywords_de as keywords, fpath_namer, mh_brands, clean_url, headers from matcher import BrandMatcher from ers import COLLECTION_DATE, file_hash, img_path_namer, TEST_PAGES_FOLDER_PATH import shutil from custom_browser import CustomDriver from ers import clean_xpathd_text # Init variables and assets shop_id = 'real' root_url = 'https://www.real.de' session = requests_cache.CachedSession(fpath_namer(shop_id, 'requests_cache')) country = 'DE' searches, categories, products = {}, {}, {} driver = CustomDriver(headless=True) brm = BrandMatcher() from parse import parse def getprice(pricestr): if not pricestr: return price = parse('{pound:d}', pricestr) if price: return price.named['pound'] * 100 price = parse('{pound:d},{pence:d}', pricestr)
from validators import validate_raw_files, check_products_detection
from create_csvs import create_csvs
from ers import all_keywords_usa as keywords, fpath_namer, mh_brands, clean_url, shop_inventory_lw_csv
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer, TEST_PAGES_FOLDER_PATH
from custom_browser import CustomDriver
from parse import parse
from ers import clean_xpathd_text

# Init variables and assets
shop_id = 'astor_wines'
root_url = 'http://www.astorwines.com'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'USA'
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True)
brm = BrandMatcher()


def getprice(pricestr):
    """Convert a USD price string (e.g. '$12.34') to integer cents.

    Returns None for empty/blank input.
    """
    if not pricestr:
        return
    # Keep only digits, dots and dollar signs; commas are dropped, so the
    # thousands fallback below is mostly defensive.
    pricestr = re.sub("[^0-9.$]", "", pricestr)
    price = parse('${pound:d}.{pence:d}', pricestr)
    if not price:
        price = parse('${th:d},{pound:d}.{pence:d}', pricestr)
        return price.named['th'] * 100000 + price.named['pound'] * 100 + price.named['pence']
    # BUG FIX: original returned None when the simple '$x.yy' pattern
    # matched; return its value in cents.
    return price.named['pound'] * 100 + price.named['pence']
from create_csvs import create_csvs
from custom_browser import CustomDriver
from ers import COLLECTION_DATE, file_hash, img_path_namer
from ers import all_keywords_fr as keywords, TEST_PAGES_FOLDER_PATH
from ers import clean_xpathd_text
from ers import fpath_namer, mh_brands, clean_url, headers
from matcher import BrandMatcher
from validators import validate_raw_files

parser = etree.HTMLParser()

# Shop-level assets and collection state
shop_id = 'auchan_drive'
root_url = 'https://www.auchandrive.fr'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'FR'
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True, download_images=True)


def getprice(pricestr):
    """Parse a French price string such as '12,34€' into integer cents.

    Returns None for empty input or when the string does not match
    the 'euros,cents€' pattern.
    """
    if not pricestr:
        return None
    cleaned = re.sub("[^0-9,€]", "", pricestr)
    parsed = parse('{euro:d},{cent:d}€', cleaned)
    if parsed is None:
        return None
    return parsed.named['euro'] * 100 + parsed.named['cent']

###################
import requests_cache, imghdr
from validators import validate_raw_files
from create_csvs import create_csvs
from ers import all_keywords_aus as keywords, fpath_namer, mh_brands, clean_url, headers
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer
import shutil
from parse import parse
from custom_browser import CustomDriver

# Init variables and assets
shop_id = 'goodygoody'
root_url = 'https://www.goodygoody.com/'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'AUS'
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True, download_images=True)


def getprice(pricestr):
    """Convert a price string like '$12.34' (or '34p') to integer cents.

    Returns '' unchanged for empty input, and None when neither pattern
    matches.
    """
    if pricestr == '':
        return pricestr
    pricestr = pricestr.replace(',', '').strip()
    price = parse('${dol:d}.{pence:d}', pricestr)
    if price is None:
        price = parse('{pence:d}p', pricestr)
        # BUG FIX: original dereferenced price.named unconditionally and
        # raised AttributeError when both patterns failed.
        if price is None:
            return None
        return price.named['pence']
    else:
        return price.named['dol'] * 100 + price.named['pence']
# 'gin': '',#no gin # 'tequila': '',#no tequila # 'liquor': '',#no liquor # 'white_wine': 'https://www.aeondewine.com/shop/c/c060102/?l:inkid=aw69_avGM7kHb', # 'red_wine': 'https://www.aeondewine.com/shop/c/c060101/?linkid=aw69_Xl3132nk', # 'bourbon': '',#no bourbon # 'brandy': '',#no brandy # 'rum': '',#no rum } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 print("Beginning ", ctg, url) if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)): driver.get(url) # If files exist, don't scrap perform_scrapping = not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)) for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath) and perform_scrapping: sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) print(fpath, ctg, p, len(categories[ctg])) # Break or change pages if number_of_pdcts_in_ctg == len(categories[ctg]): print("Finished, because no more new products")
'http://www.kakuyasu.co.jp/ec/disp/CSfDispListPage_001.jsp?dispNo=001005024&q=&j=&min=&max=&ys=&yl=&yoryotanni=&allSearch=&type=01&sort=01&page={page}', 'red_wine': 'http://www.kakuyasu.co.jp/ec/disp/CSfDispListPage_001.jsp?dispNo=001005001&q=&j=&min=&max=&ys=&yl=&yoryotanni=&allSearch=&type=01&sort=01&page={page}', # 'bourbon': '',#na 'brandy': 'http://www.kakuyasu.co.jp/ec/disp/CSfDispListPage_001.jsp?dispNo=001014&q=&j=&min=&max=&ys=&yl=&yoryotanni=&allSearch=&type=01&sort=01&page={page}', # 'rum': '',#na } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(url.format(page=p + 1)) sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) if len(set(categories[ctg])) == number_of_pdcts_in_ctg: print("Finishing with :", len(set(categories[ctg])), "products") break else: number_of_pdcts_in_ctg = len(set(categories[ctg])) print(ctg, url, p, len(categories[ctg])) ######################################
import requests_cache, imghdr
from validators import validate_raw_files
from create_csvs import create_csvs
from custom_browser import CustomDriver
from ers import all_keywords_usa as keywords, mh_brands, clean_url, headers
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer, fpath_namer
import shutil

driver = CustomDriver(headless=False, download_images=True)

# Init variables and assets
shop_id = 'tower'
root_url = 'http://buckhead.towerwinespirits.com'
# POST responses must be cached too: the shop's listing pages are POST-driven.
session = requests_cache.core.CachedSession(
    fpath_namer(shop_id, 'requests_cache'), allowable_methods=('GET', 'POST'))
country = 'USA'
searches, categories, products = {}, {}, {}
from parse import parse


def getprice(pricestr):
    """Convert a USD price string ('$12.34' or '$1,234.56', possibly
    prefixed with 'Reg.') to integer cents.

    Returns None for empty input or when no pattern matches.
    """
    if not pricestr:
        return
    # Drop the 'Reg.' label and non-breaking spaces before parsing.
    pricestr = pricestr.replace("Reg.", "").replace("\xa0", "")
    price = parse('${pound:d}.{pence:d}', pricestr)
    if not price:
        price = parse('${th:d},{pound:d}.{pence:d}', pricestr)
        # BUG FIX: guard against both patterns failing, which previously
        # raised AttributeError on price.named.
        if not price:
            return None
        return price.named['th'] * 100000 + price.named['pound'] * 100 + price.named['pence']
    return price.named['pound'] * 100 + price.named['pence']
import requests_cache, imghdr from validators import validate_raw_files from create_csvs import create_csvs from ers import all_keywords_usa as keywords, fpath_namer, mh_brands, clean_url, headers from matcher import BrandMatcher from ers import COLLECTION_DATE, file_hash, img_path_namer import shutil from custom_browser import CustomDriver from parse import parse # Init variables and assets shop_id = 'twin_liquors' root_url = 'http://www.twinliquors.com' requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache')) country = 'USA' searches, categories, products = {}, {}, {} driver = CustomDriver(headless=True, download_images=False) urls_ctgs_dict = { 'champagne': 'http://twinliquors.com/shop/catalogsearch/result/?q=champagne', 'sparkling': 'http://twinliquors.com/shop/catalogsearch/result/?q=Sparkling+wine', 'still_wines': 'http://twinliquors.com/shop/wine.html', 'whisky': 'http://twinliquors.com/shop/catalogsearch/result/?q=whisky', 'cognac': 'http://twinliquors.com/shop/catalogsearch/result/?q=cognac', 'vodka': 'http://twinliquors.com/shop/catalogsearch/result/?q=vodka', 'red_wine': 'http://twinliquors.com/shop/catalogsearch/result/?q=red+wine', 'white_wine': 'http://twinliquors.com/shop/catalogsearch/result/?q=red+wine', 'tequila': 'http://twinliquors.com/shop/catalogsearch/result/?q=tequila', 'gin': 'http://twinliquors.com/shop/catalogsearch/result/?q=gin',
'https://lohaco.jp/g3/71-5107-5110002/?resultCount=100&page={page}', 'red_wine': 'https://lohaco.jp/g3/71-5107-5107002/?resultCount=100&page={page}', # 'bourbon': '',#na 'brandy': 'https://lohaco.jp/g3/71-5111-5110009/?resultCount=100&page={page}', 'rum': 'https://lohaco.jp/g3/71-5111-5110012/?resultCount=100&page={page}', } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(url.format(page=p + 1)) sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) if len(set(categories[ctg])) == number_of_pdcts_in_ctg: break else: number_of_pdcts_in_ctg = len(set(categories[ctg])) print(ctg, url, p, len(categories[ctg])) ###################################### # # KW searches scrapping ############
if 'pdct_img_main_url' in pdt and pdt[ 'pdct_img_main_url'] and brm.find_brand( pdt['pdct_name_on_eretailer'])['brand'] in mh_brands: print(pdt['pdct_name_on_eretailer'] + "." + pdt['pdct_img_main_url'].split('.')[-1]) response = requests.get(pdt['pdct_img_main_url'], stream=True, verify=False, headers=headers) # response.raw.decode_content = True tmp_file_path = '/tmp/' + shop_id + 'mhers_tmp_{}.imgtype'.format( abs(hash(pdt['pdct_img_main_url']))) img_path = img_path_namer(shop_id, pdt['pdct_name_on_eretailer']) with open(tmp_file_path, 'wb') as out_file: shutil.copyfileobj(response.raw, out_file) if imghdr.what(tmp_file_path) is not None: img_path = img_path.split('.')[0] + '.' + imghdr.what( '/tmp/' + shop_id + 'mhers_tmp_{}.imgtype'.format( abs(hash(pdt['pdct_img_main_url'])))) shutil.copyfile( '/tmp/' + shop_id + 'mhers_tmp_{}.imgtype'.format( abs(hash(pdt['pdct_img_main_url']))), img_path) products[url].update({ 'img_path': img_path, 'img_hash': file_hash(img_path) }) create_csvs(products, categories, searches, shop_id, fpath_namer(shop_id, 'raw_csv'), COLLECTION_DATE) validate_raw_files(fpath_namer(shop_id, 'raw_csv'))
from validators import validate_raw_files
from create_csvs import create_csvs
import requests_cache, imghdr
from ers import all_keywords_de as keywords, mh_brands, clean_url
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer, fpath_namer
import requests
import shutil
from helpers.random_user_agent import randomua
from parse import parse

# Init variables and assets
shop_id = 'rewe'
root_url = 'https://shop.rewe.de'
session = requests_cache.CachedSession(fpath_namer(shop_id, 'requests_cache'))
# Randomised UA to avoid trivial bot blocking.
session.headers = {'User-Agent': randomua()}
country = 'DE'
searches, categories, products = {}, {}, {}


def getprice(pricestr):
    """Convert a German price string ('12,34€' or '12€') to integer cents.

    Empty/None input is returned unchanged; unparseable input yields None.
    """
    # BUG FIX: guard falsy input before .replace — the original crashed
    # with AttributeError when passed None ('' is still returned as-is).
    if not pricestr:
        return pricestr
    pricestr = pricestr.replace(' ', '')
    if pricestr == '':
        return pricestr
    price = parse('{dol:d},{pence:d}€', pricestr)
    if price is None:
        price = parse('{dol:d}€', pricestr)
        # BUG FIX: both patterns can fail; previously raised AttributeError.
        if price is None:
            return None
        return price.named['dol'] * 100
    else:
        return price.named['dol'] * 100 + price.named['pence']
from create_csvs import create_csvs
from custom_browser import CustomDriver
from ers import COLLECTION_DATE, file_hash, img_path_namer, TEST_PAGES_FOLDER_PATH, shop_inventory_lw_csv
from ers import all_keywords_fr as keywords
from ers import clean_xpathd_text
from ers import fpath_namer, mh_brands, clean_url
from matcher import BrandMatcher
from validators import validate_raw_files, check_products_detection

parser = etree.HTMLParser()

# Shop-level assets and collection state
shop_id = 'auchan_drive'
root_url = 'https://www.auchandrive.fr'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'FR'
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True, download_images=True)
brm = BrandMatcher()


def getprice(pricestr):
    """Parse a French price string such as '12,34€' into integer cents.

    Returns None for empty input or when the 'euros,cents€' pattern
    does not match.
    """
    if not pricestr:
        return None
    cleaned = re.sub("[^0-9,€]", "", pricestr)
    parsed = parse('{euro:d},{cent:d}€', cleaned)
    if parsed is None:
        return None
    return parsed.named['euro'] * 100 + parsed.named['cent']
import requests_cache, imghdr from validators import validate_raw_files from create_csvs import create_csvs from ers import all_keywords_usa as keywords, fpath_namer, mh_brands, clean_url, headers from matcher import BrandMatcher from ers import COLLECTION_DATE, file_hash, img_path_namer import shutil from custom_browser import CustomDriver from parse import parse # Init variables and assets shop_id = "wallys" root_url = "http://www.wallywine.com/" requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache')) country = "USA" searches, categories, products = {}, {}, {} # If necessary driver = CustomDriver(headless=True) def getprice(pricestr): if pricestr == '': return pricestr pricestr = re.sub("[^0-9.$]", "", pricestr) price = parse('${pound:d}.{pence:d}', pricestr) if price is None: price = parse('{pence:d}p', pricestr)
'liquor': 'https://www.seijoishii.com/c/242?&row_limit=50&page={page}', 'white_wine': 'https://www.seijoishii.com/c/1284?&row_limit=50&page={page}', 'red_wine': 'https://www.seijoishii.com/c/1283?&row_limit=50&page={page}', 'bourbon': 'https://www.seijoishii.com/c/277?&row_limit=50&page={page}', 'brandy': 'https://www.seijoishii.com/c/239?&row_limit=50&page={page}', # 'rum': '', } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(url.format(page=p + 1)) sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) if len(set(categories[ctg])) == number_of_pdcts_in_ctg: break else: number_of_pdcts_in_ctg = len(set(categories[ctg])) print(ctg, url, p, len(categories[ctg])) ###################################### # # KW searches scrapping ############
from create_csvs import create_csvs from ers import all_keywords_aus as keywords, mh_brands, headers from matcher import BrandMatcher from ers import COLLECTION_DATE, file_hash, img_path_namer, fpath_namer import shutil from helpers.random_user_agent import randomua import requests from custom_browser import CustomDriver from time import sleep # Init variables and assets shop_id = 'my_bottle_shop' root_url = 'https://www.mybottleshop.com.au' session = requests_cache.CachedSession(fpath_namer(shop_id, 'requests_cache')) session.headers = {'User-Agent': randomua()} driver = CustomDriver(headless=False, download_images=True) with session.cache_disabled(): session.get('https://www.mybottleshop.com.au/directory/currency/switch/currency/AUD/uenc/') # print(session.cookies) country = 'AUS' searches, categories, products = {}, {}, {} from parse import parse def getprice(pricestr): if not pricestr: return price = parse('{pound:d}', pricestr) if price:
import requests_cache, imghdr
from parse import parse
from validators import validate_raw_files
from create_csvs import create_csvs
from ers import all_keywords_uk as keywords
from ers import fpath_namer, mh_brands, clean_url, headers
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer
import shutil
from custom_browser import CustomDriver

# Init variables and assets
shop_id = 'sainsbury'
root_url = 'https://www.sainsburys.co.uk'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'UK'
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True, download_images=False)


def getprice(pricestr):
    """Convert a GBP price string ('£12.34' or '34p') to integer pence.

    Returns '' unchanged for empty input, and None when neither pattern
    matches.
    """
    if pricestr == '':
        return pricestr
    pricestr = pricestr.replace(',', '').strip()
    price = parse('£{pound:d}.{pence:d}', pricestr)
    if price is None:
        price = parse('{pence:d}p', pricestr)
        # BUG FIX: original dereferenced price.named unconditionally and
        # raised AttributeError when both patterns failed.
        if price is None:
            return None
        return price.named['pence']
    else:
        return price.named['pound'] * 100 + price.named['pence']
import requests_cache
from validators import validate_raw_files, check_products_detection
from create_csvs import create_csvs
from ers import all_keywords_usa as keywords, fpath_namer, mh_brands, clean_url, shop_inventory_lw_csv
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer, TEST_PAGES_FOLDER_PATH
from custom_browser import CustomDriver
from parse import parse
from ers import clean_xpathd_text

# Init variables and assets
shop_id = 'goodygoody'
root_url = 'https://www.goodygoody.com/'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
# NOTE(review): country is 'AUS' but keywords are all_keywords_usa and the
# shop is US — looks copy-pasted; confirm which market this collector serves.
country = 'AUS'
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True)
brm = BrandMatcher()


def getprice(pricestr):
    """Convert a price string like '$12.34' (or '34p') to integer cents.

    Returns '' unchanged for empty input, and None when neither pattern
    matches.
    """
    if pricestr == '':
        return pricestr
    pricestr = pricestr.replace(',', '').strip()
    price = parse('${dol:d}.{pence:d}', pricestr)
    if price is None:
        price = parse('{pence:d}p', pricestr)
        # BUG FIX: guard the fallback pattern to avoid AttributeError on
        # totally unparseable input.
        if price is None:
            return None
        return price.named['pence']
    # BUG FIX: original returned None when the '$x.yy' pattern matched;
    # return its value in cents (cf. the other goodygoody variant of this
    # helper, which has this branch).
    return price.named['dol'] * 100 + price.named['pence']