def get_prices(url):
    """Scrape every evomag listing page starting at *url*.

    For each product a (name, price, availability, link, image) row is
    appended to a dated CSV file and a (name, price, link, image) row is
    inserted through the module-level ``db_connection``.

    Relies on module globals: db_connection, scrape_date, shop_id, plus
    the helpers get_name/get_price/get_stoc and SITE_URL, DELAY, base.
    """
    global db_connection
    global scrape_date
    global shop_id
    page = requests.get(url)
    current_page = 1
    soup = BeautifulSoup(page.text)
    # Derive the page count from the "last page" link.  Fall back to a
    # single page when the element or its trailing number is missing —
    # the original indexed find_all(...)[-1] unconditionally, which
    # raised IndexError on unpaginated categories.
    last_links = soup.find_all(class_='last')
    number_of_pages = 1
    if last_links:
        pages = last_links[-1].find('a')['href']
        match = re.search(r'\d+$', pages)
        if match is not None:
            number_of_pages = int(match.group())
    print(number_of_pages)
    with open('csv/evomag/evomag-{0}.csv'.format(
            time.strftime("%d-%m-%y")), 'ab') as csv_file:
        evomag_db = csv.writer(csv_file)
        while current_page <= number_of_pages:
            soup = BeautifulSoup(page.text)
            produse = soup.find_all(class_='prod_list_det')
            for produs in produse:
                name = get_name(produs.find_next("a")['title'])  # title
                price = get_price(
                    produs.find(class_='discount_span').previous_sibling)
                link = "{0}{1}".format(SITE_URL, produs.find('a')['href'])
                imagine = "{0}{1}".format(SITE_URL, produs.find('img')['src'])
                availability = get_stoc(
                    produs.find(class_='stoc_produs').find_next('span').text)
                entry = (name, price, availability, link, imagine)
                # The DB row omits availability; renamed from the original,
                # which rebound the loop variable `produs` to this tuple.
                db_row = (name, price, link, imagine)
                db_connection.insert_product(db_row, shop_id, scrape_date)
                evomag_db.writerow(entry)
            current_page += 1
            page_url = "{0}{1}{2}".format(url, "Filtru/Pagina:", current_page)
            time.sleep(DELAY)
            page = base.get_page_content(page_url)
            # Skip pages that failed to download; stop once past the end.
            while not page:
                if current_page > number_of_pages:
                    break
                current_page += 1
                time.sleep(DELAY)
                page_url = "{0}{1}{2}".format(
                    url, "Filtru/Pagina:", current_page)
                page = base.get_page_content(page_url)
def get_prices(url):
    """Scrape every evomag listing page starting at *url*.

    Writes one CSV row (name, price, availability, link, image) per
    product to a dated file and inserts the (name, price, link, image)
    subset via the module-level ``db_connection``.

    Uses module globals db_connection / scrape_date / shop_id and the
    helpers get_name, get_price, get_stoc, SITE_URL, DELAY, base.
    """
    global db_connection
    global scrape_date
    global shop_id

    def count_pages(doc):
        # Read the page total out of the "last page" link.  Default to one
        # page when pagination is absent — the original unconditionally
        # indexed find_all(...)[-1], raising IndexError in that case.
        anchors = doc.find_all(class_='last')
        if not anchors:
            return 1
        href = anchors[-1].find('a')['href']
        match = re.search(r'\d+$', href)
        return int(match.group()) if match else 1

    page = requests.get(url)
    current_page = 1
    number_of_pages = count_pages(BeautifulSoup(page.text))
    print(number_of_pages)
    with open('csv/evomag/evomag-{0}.csv'.format(
            time.strftime("%d-%m-%y")), 'ab') as csv_file:
        evomag_db = csv.writer(csv_file)
        while current_page <= number_of_pages:
            soup = BeautifulSoup(page.text)
            for item in soup.find_all(class_='prod_list_det'):
                name = get_name(item.find_next("a")['title'])  # title
                price = get_price(
                    item.find(class_='discount_span').previous_sibling)
                link = "{0}{1}".format(SITE_URL, item.find('a')['href'])
                imagine = "{0}{1}".format(SITE_URL, item.find('img')['src'])
                availability = get_stoc(
                    item.find(class_='stoc_produs').find_next('span').text)
                # DB row drops availability; DB insert happens before the
                # CSV write, matching the original side-effect order.
                db_connection.insert_product(
                    (name, price, link, imagine), shop_id, scrape_date)
                evomag_db.writerow(
                    (name, price, availability, link, imagine))
            current_page += 1
            page_url = "{0}{1}{2}".format(url, "Filtru/Pagina:", current_page)
            time.sleep(DELAY)
            page = base.get_page_content(page_url)
            # Skip pages that failed to download; stop once past the end.
            while not page:
                if current_page > number_of_pages:
                    break
                current_page += 1
                time.sleep(DELAY)
                page_url = "{0}{1}{2}".format(
                    url, "Filtru/Pagina:", current_page)
                page = base.get_page_content(page_url)
def get_prices(url):
    """Scrape every emag result page of *url* into the DB and a dated CSV.

    Each product yields a CSV row (name, price, availability, link, image)
    and a DB row (name, price, link, image) inserted through the
    module-level db_connection / shop_id / scrape_date globals.
    """
    global db_connection
    global shop_id
    global scrape_date
    current_page = 0
    page = requests.get(url)
    soup = BeautifulSoup(page.text)
    # Fix: find() returns None when the pagination header is absent; the
    # original read .text off it unconditionally (AttributeError).  With
    # no header only the already-fetched first page is scraped.
    pagination = soup.find(class_='left-part')
    if pagination is None:
        number_of_pages = 0
    else:
        number_of_pages = get_number_of_pages(pagination.text)
    with open('csv/emag/emag-{0}.csv'.format(
            time.strftime("%d-%m-%y")), 'ab') as csv_file:
        emag_db = csv.writer(csv_file)
        while current_page <= number_of_pages:
            soup = BeautifulSoup(page.text)
            products = soup.find_all(attrs={"name": "product[]"})
            for product in products:
                name = get_name(product.find_next("a")['title'])
                # Strip the thousands separator from the integer price part.
                price = str(
                    product.find_next(class_='money-int').text.replace(
                        '.', ''))
                availability = get_availability(
                    product.find_next(class_='stare-disp-listing').text)
                link = "http://www.emag.ro{0}".format(
                    product.find_next("a")['href'])
                image = product.find_next('img')['src']
                entry = (name, price, availability, link, image)
                produs = (name, price, link, image)  # DB row: no availability
                db_connection.insert_product(produs, shop_id, scrape_date)
                emag_db.writerow(entry)
            current_page += 1
            time.sleep(DELAY)
            page_url = "{0}p{1}/c".format(url[:-1], current_page)
            page = base.get_page_content(page_url)
            # Skip pages that failed to download; stop once past the end.
            while not page:
                if current_page > number_of_pages:
                    break
                current_page += 1
                time.sleep(DELAY)
                page_url = "{0}p{1}/c".format(url[:-1], current_page)
                page = base.get_page_content(page_url)
    print(current_page + 1)
def get_prices(
        url="http://www.electrofun.ro/aparate-pentru-bucatarie/masini-de-tocat/"
):
    """Walk every electrofun listing page and record each product.

    Appends a (name, price, availability, link, image) row to the dated
    CSV and inserts the (name, price, link, image) subset through the
    module-level db_connection.  Returns early when the page carries no
    pagination element.
    """
    global db_connection
    global scrape_date
    global shop_id
    page = requests.get(url, headers=headers)
    current_page = 1
    soup = BeautifulSoup(page.text)
    if not soup.find(class_='x-pages'):
        return
    number_of_pages = get_number_of_pages(
        soup.find(class_='x-pages').find_all('a')[-1]['href'])
    csv_path = 'csv/electrofun/electrofun-{0}.csv'.format(DATE)
    with open(csv_path, 'ab') as csv_file:
        sheet = csv.writer(csv_file)
        while current_page <= number_of_pages:
            listing = BeautifulSoup(page.text)
            for product in listing.find_all(class_='x-product-line'):
                title = get_name(product.find_next('a').text)
                href = product.find_next('a')['href']
                img = product.find_next('img')['src']
                stock = str(product.find_next('strong').text)
                cost = get_price(str(product.find(class_='price_tag').text))
                # DB insert first, then the CSV row, as in the original.
                db_connection.insert_product(
                    (title, cost, href, img), shop_id, scrape_date)
                sheet.writerow((title, cost, stock, href, img))
            current_page += 1
            page_url = "{0}pagina{1}/".format(url, current_page)
            page = base.get_page_content(page_url)
            time.sleep(DELAY)
            print(current_page)
            # Advance past pages that failed to download.
            while not page:
                if current_page > number_of_pages:
                    break
                current_page += 1
                time.sleep(DELAY)
                page_url = "{0}pagina{1}/".format(url, current_page)
                page = base.get_page_content(page_url)
def get_prices(url):
    """Scrape emag product listings into the DB and a dated CSV file.

    Iterates over every result page of *url*; per product it records
    (name, price, availability, link, image) in the CSV and inserts
    (name, price, link, image) via the module globals db_connection /
    shop_id / scrape_date.
    """
    global db_connection
    global shop_id
    global scrape_date
    current_page = 0
    page = requests.get(url)
    soup = BeautifulSoup(page.text)
    # Fix: soup.find() yields None when the pagination header is missing;
    # the original dereferenced .text unconditionally (AttributeError).
    # Defaulting to 0 keeps the single already-fetched page in scope.
    header = soup.find(class_='left-part')
    number_of_pages = get_number_of_pages(header.text) if header else 0
    with open('csv/emag/emag-{0}.csv'.format(
            time.strftime("%d-%m-%y")), 'ab') as csv_file:
        emag_db = csv.writer(csv_file)
        while current_page <= number_of_pages:
            soup = BeautifulSoup(page.text)
            products = soup.find_all(attrs={"name": "product[]"})
            for product in products:
                name = get_name(product.find_next("a")['title'])
                # Price kept as a string with the thousands separator removed.
                price = str(product.find_next(
                    class_='money-int').text.replace('.', ''))
                availability = get_availability(
                    product.find_next(class_='stare-disp-listing').text)
                link = "http://www.emag.ro{0}".format(
                    product.find_next("a")['href'])
                image = product.find_next('img')['src']
                entry = (name, price, availability, link, image)
                produs = (name, price, link, image)  # DB row: no availability
                db_connection.insert_product(produs, shop_id, scrape_date)
                emag_db.writerow(entry)
            current_page += 1
            time.sleep(DELAY)
            page_url = "{0}p{1}/c".format(url[:-1], current_page)
            page = base.get_page_content(page_url)
            # Skip pages that failed to download; stop once past the end.
            while not page:
                if current_page > number_of_pages:
                    break
                current_page += 1
                time.sleep(DELAY)
                page_url = "{0}p{1}/c".format(url[:-1], current_page)
                page = base.get_page_content(page_url)
    print(current_page + 1)
def get_prices(url="http://www.electrofun.ro/aparate-pentru-bucatarie/masini-de-tocat/"):
    """Collect electrofun product data into the DB and a dated CSV.

    Behaviour-preserving rewrite: same pages fetched, same rows written
    in the same order.  Bails out when no pagination element is present.
    """
    global db_connection
    global scrape_date
    global shop_id
    page = requests.get(url, headers=headers)
    current_page = 1
    doc = BeautifulSoup(page.text)
    pager = doc.find(class_="x-pages")
    if not pager:
        # No pagination block on this layout -> nothing to scrape.
        return
    number_of_pages = get_number_of_pages(pager.find_all("a")[-1]["href"])
    with open("csv/electrofun/electrofun-{0}.csv".format(DATE),
              "ab") as csv_file:
        writer = csv.writer(csv_file)
        while current_page <= number_of_pages:
            doc = BeautifulSoup(page.text)
            for item in doc.find_all(class_="x-product-line"):
                name = get_name(item.find_next("a").text)
                link = item.find_next("a")["href"]
                imagine = item.find_next("img")["src"]
                availability = str(item.find_next("strong").text)
                price = get_price(str(item.find(class_="price_tag").text))
                # DB row drops availability; insert precedes the CSV write.
                db_connection.insert_product(
                    (name, price, link, imagine), shop_id, scrape_date)
                writer.writerow((name, price, availability, link, imagine))
            current_page += 1
            page_url = "{0}pagina{1}/".format(url, current_page)
            page = base.get_page_content(page_url)
            time.sleep(DELAY)
            print(current_page)
            # Advance past pages that failed to download.
            while not page:
                if current_page > number_of_pages:
                    break
                current_page += 1
                time.sleep(DELAY)
                page_url = "{0}pagina{1}/".format(url, current_page)
                page = base.get_page_content(page_url)
def get_prices(url="http://www.pcfun.ro/ultrabook/"):
    """Scrape every pcfun listing page into the DB and a dated CSV.

    Per product, a (name, price, availability, link, image) row goes to
    the CSV and a (name, price, link, image) row is inserted via the
    module globals db_connection / shop_id / scrape_date.
    """
    global db_connection
    global shop_id
    global scrape_date
    page = requests.get(url, headers=headers)
    current_page = 1
    soup = BeautifulSoup(page.text)
    # Fix: find() returns None when the "more pages" control is absent;
    # the original chained .find_previous on it and crashed.  Treat such
    # listings as a single page (mirrors the guard in the sibling
    # electrofun scraper).
    pager = soup.find(class_='x-pages-more')
    if pager:
        number_of_pages = get_number_of_pages(
            pager.find_previous('a')['href'])
    else:
        number_of_pages = 1
    with open('csv/pcfun/pcfun-{0}.csv'.format(DATE), 'ab') as csv_file:
        pcfun_db = csv.writer(csv_file)
        while current_page <= number_of_pages:
            soup = BeautifulSoup(page.text)
            products = soup.find_all(class_='x-product-line')
            for product in products:
                name = get_name(product.find_next('a').text)
                link = product.find_next('a')['href']
                imagine = product.find_next('img')['src']
                availability = str(product.find_next('strong').text)
                price = get_price(
                    str(product.find_next(class_='price_tag').text))
                entry = (name, price, availability, link, imagine)
                produs = (name, price, link, imagine)  # DB row: no availability
                db_connection.insert_product(produs, shop_id, scrape_date)
                pcfun_db.writerow(entry)
            current_page += 1
            page_url = "{0}pagina{1}/".format(url, current_page)
            page = base.get_page_content(page_url)
            time.sleep(DELAY)
            print(current_page)
            # Advance past pages that failed to download.
            while not page:
                if current_page > number_of_pages:
                    break
                current_page += 1
                time.sleep(DELAY)
                page_url = "{0}pagina{1}/".format(url, current_page)
                page = base.get_page_content(page_url)