def get_images(boxes_array, info_tuple, test=False):
    """Collect the image source link of every product box.

    Args:
        boxes_array: Sequence of product boxes; each box is handed to
            ``search_boxes`` on its own.
        info_tuple: Search description forwarded unchanged to
            ``search_boxes``.
        test: When truthy, print the resulting list before returning it.

    Returns:
        A list with one entry per box, in the same order. Each entry is the
        ``src`` value read from ``match.img`` of the first match, or ``None``
        when the search finds nothing or the first match has no ``img``.
    """
    images = [None] * len(boxes_array)
    for idx, box in enumerate(boxes_array):
        matches = search_boxes(box, info_tuple)
        if not matches:
            continue
        img_tag = matches[0].img
        # A match without an image keeps its None placeholder instead of
        # raising AttributeError and aborting the whole page.
        if img_tag is not None:
            images[idx] = img_tag.get('src')
    if test:
        print(images)
    return images
_AMAZON_BASE_URL = 'https://www.amazon.com.mx'
# Stores whose link lives on the anchor nested in the match. 'Mercado Libre'
# is kept for backward compatibility; 'Mercado_Libre' is what ``__name__``
# yields for a class spelled that way.
_ANCHOR_LINK_PAGES = ('Ebay', 'Mercado Libre', 'Mercado_Libre')


def _link_from_match(match, attr, is_amazon):
    """Return the product link held by *match*, or None when it is absent."""
    if is_amazon:
        relative = match.get(attr)
        return None if relative is None else _AMAZON_BASE_URL + relative
    anchor = match.a
    return None if anchor is None else anchor.get(attr)


def get_products_urls(boxes_array, Page, test_all=False, test_len=False, position=None):
    """Collect product URLs from the boxes of a results page.

    Args:
        boxes_array: Sequence of product boxes.
        Page: Store description. This function reads ``Page.url_get`` (name
            of the attribute holding the link), ``Page.product_urls`` (search
            description passed to ``search_boxes``) and ``Page.__name__``.
        test_all: When truthy, print every raw link and the final result.
        test_len: When truthy (and ``test_all`` is not), print only the
            number of entries.
        position: Index of a single box to inspect, e.g. the cheapest one.
            ``None`` (default) scans every box. Index ``0`` is honoured.

    Returns:
        With ``position=None``: a list with one entry per box, each a URL
        string or ``None`` when nothing was found for that box or the store
        name is not recognised.
        With a position: the URL of that box (or ``None`` if the match holds
        no link). If the search finds nothing, the list of ``None``
        placeholders is returned unchanged.
    """
    urls = [None] * len(boxes_array)
    url = Page.url_get
    page_name = Page.__name__
    is_amazon = page_name == 'Amazon'

    if position is not None:
        matches = search_boxes(boxes_array[position], Page.product_urls)
        if matches:
            urls = _link_from_match(matches[0], url, is_amazon)
    else:
        for idx, box in enumerate(boxes_array):
            matches = search_boxes(box, Page.product_urls)
            if not matches:
                continue
            if test_all:
                print(matches[0].get(url))
            if is_amazon or page_name in _ANCHOR_LINK_PAGES:
                urls[idx] = _link_from_match(matches[0], url, is_amazon)

    if test_all:
        print(urls)
    elif test_len:
        print('urls:', len(urls))
    return urls
def get_products_urls(boxes_array, info_tuple, test=False,
                      base_url='https://www.amazon.com.mx'):
    """Build the absolute product URL of every box.

    Args:
        boxes_array: Sequence of product boxes.
        info_tuple: Search description forwarded unchanged to
            ``search_boxes``.
        test: When truthy, print the resulting list before returning it.
        base_url: Prefix prepended to each ``href`` value. Defaults to the
            Amazon Mexico domain that used to be hard-coded here.

    Returns:
        A list with one entry per box, in the same order: ``base_url`` plus
        the ``href`` of the first match, or ``None`` when the search finds
        nothing or the match carries no ``href``.
    """
    urls = [None] * len(boxes_array)
    for idx, box in enumerate(boxes_array):
        matches = search_boxes(box, info_tuple)
        if not matches:
            continue
        relative = matches[0].get('href')
        # A missing href would otherwise raise TypeError on concatenation.
        if relative is not None:
            urls[idx] = base_url + relative
    if test:
        print(urls)
    return urls
def test_products_info_getters(self):
    """Each Ebay getter must return exactly one entry per product box."""
    country = 'mx'
    user_request = 'audifonos inalambricos'

    search_url = Ebay.adapt_url(Ebay, user_request, country)
    soup = extract_soup(search_url, 1, just_soup=True)
    boxes = search_boxes(soup, Ebay.boxes)

    # The price getter is currently left out of this check.
    counts = {
        'ebay_names': len(get_names(boxes, Ebay.name_and_images)),
        'ebay_images': len(get_images(boxes, Ebay)),
        'ebay_urls': len(get_products_urls(boxes, Ebay)),
    }

    for count in counts.values():
        self.assertEqual(len(boxes), count)
def get_stars(boxes_array, Page, country='mx', test_all=False, test_len=False, position=None):
    """Read the star rating of every product box as a float.

    Args:
        boxes_array: Sequence of product boxes.
        Page: Store description. This function reads ``Page.stars`` (search
            description passed to ``search_boxes``) and
            ``Page.money_dict[country]['decimal']``.
        country: Key into ``Page.money_dict``.
        test_all, test_len, position: Accepted for signature compatibility;
            not used by this function.

    Returns:
        A list with one entry per box, in the same order: the rating parsed
        from the first three characters of the first match's text, or
        ``None`` when nothing was found or the text is not a number.
    """
    stars = [None] * len(boxes_array)
    decimal_sep = Page.money_dict[country]['decimal']
    for idx, box in enumerate(boxes_array):
        matches = search_boxes(box, Page.stars)
        if not matches:
            continue
        rating_text = matches[0].get_text()[:3]
        # Convert the local decimal separator to '.'; deleting it (as the
        # code used to) turned "4,5" into 45.0.
        if decimal_sep and decimal_sep != '.':
            rating_text = rating_text.replace(decimal_sep, '.')
        try:
            stars[idx] = float(rating_text)
        except ValueError:
            # Non-numeric text: keep the None placeholder for this box.
            continue
    return stars
def test_products_info_getters(self):
    """For every page in ``self.Pages`` each getter must return one entry per box."""
    country = 'mx'
    user_request = 'audifonos inalambricos'

    for Page in self.Pages:
        search_url = Page.adapt_url(Page, user_request, country)
        soup = extract_soup(search_url, 1, just_soup=True)
        boxes = search_boxes(soup, Page.boxes)

        # Lengths in a fixed order: names, images, urls, prices.
        lengths = [
            len(get_names(boxes, Page)),
            len(get_images(boxes, Page)),
            len(get_products_urls(boxes, Page)),
            len(get_price(country, boxes, Page)),
        ]

        for length in lengths:
            self.assertEqual(len(boxes), length)
def test_cheapest_gets_info(self):
    """The cheapest Ebay product must have no ``None`` field."""
    country = 'mx'
    user_request = 'audifonos inalambricos'

    search_url = Ebay.adapt_url(Ebay, user_request, country)
    soup = extract_soup(search_url, 1, just_soup=True)
    boxes = search_boxes(soup, Ebay.boxes)

    prices = get_price(country, boxes, Ebay.price)
    best_idx, best_price = cheapest(prices, position_and_price=True)
    product = get_cheapest(best_idx, boxes, best_price, country, Ebay)

    for field in product:
        self.assertIsNotNone(product[field])
def test_cheapest_gets_info(self):
    """The cheapest Mercado Libre product must have no ``None`` field."""
    country = 'mx'
    user_request = 'audifonos inalambricos'

    search_url = Mercado_Libre.adapt_url(Mercado_Libre, user_request, country)
    soup = extract_soup(search_url, 1, just_soup=True)
    boxes = search_boxes(soup, Mercado_Libre.boxes)

    prices = get_price(country, boxes, Mercado_Libre.price)
    best_idx, best_price = cheapest(prices, position_and_price=True)
    product = get_cheapest(best_idx, boxes, best_price, country, Mercado_Libre)

    for field in product:
        self.assertIsNotNone(product[field])
def test_products_info_getters(self):
    """Each Mercado Libre getter must return exactly one entry per product box."""
    country = 'mx'
    user_request = 'audifonos inalambricos'

    search_url = Mercado_Libre.adapt_url(Mercado_Libre, user_request, country)
    soup = extract_soup(search_url, 1, just_soup=True)
    boxes = search_boxes(soup, Mercado_Libre.boxes)

    counts = {
        'ml_names': len(get_names(boxes, Mercado_Libre.name_and_images)),
        'ml_images': len(get_images(boxes, Mercado_Libre)),
        'ml_urls': len(get_products_urls(boxes, Mercado_Libre)),
        'ml_price': len(get_price(country, boxes, Mercado_Libre.price)),
    }

    for count in counts.values():
        self.assertEqual(len(boxes), count)
def get_stars(country, boxes_array, info_tuple, test=False):
    """Read the star rating of every product box as a float.

    Args:
        country: Country code. Only ``'mx'`` (ratings written with ``.``)
            and ``'br'`` (ratings written with ``,``) are parsed; for any
            other value every entry stays ``None``.
        boxes_array: Sequence of product boxes.
        info_tuple: Search description forwarded unchanged to
            ``search_boxes``.
        test: When truthy, print the resulting list before returning it.

    Returns:
        A list with one entry per box, in the same order: the rating parsed
        from the first three characters of the first match's text, or
        ``None`` when nothing was found or the text is not a number.
    """
    # Decimal separator used by each supported country.
    decimal_separators = {'mx': '.', 'br': ','}
    decimal_sep = decimal_separators.get(country)

    stars = [None] * len(boxes_array)
    for idx, box in enumerate(boxes_array):
        matches = search_boxes(box, info_tuple)
        if not matches or decimal_sep is None:
            continue
        rating_text = matches[0].get_text()[:3]
        if decimal_sep != '.':
            rating_text = rating_text.replace(decimal_sep, '.')
        try:
            stars[idx] = float(rating_text)
        except ValueError:
            # Non-numeric text: keep the None placeholder rather than
            # aborting the whole page.
            continue
    if test:
        print(stars)
    return stars
def test_products_info_getters(self):
    """Each Amazon getter must return exactly one entry per product box."""
    user_request = 'audifonos inalambricos'
    country = 'mx'

    # NOTE(review): country is passed before user_request here; confirm the
    # argument order against Amazon.adapt_url.
    amazon_url = Amazon.adapt_url(Amazon, country, user_request)
    amazon_soup = extract_soup(amazon_url, 1, just_soup=True)
    amazon_boxes = search_boxes(amazon_soup, Amazon.boxes)

    amazon_names = len(get_names(amazon_boxes, Amazon.name_and_images))
    amazon_images = len(get_images(amazon_boxes, Amazon.name_and_images))
    amazon_urls = len(get_products_urls(amazon_boxes, Amazon.product_urls))
    amazon_price = len(get_price(country, amazon_boxes, Amazon.price))
    amazon_ids = len(amazon_products_id(amazon_boxes))
    amazon_reviews = len(get_reviews(country, amazon_boxes, Amazon.reviews))
    amazon_stars = len(get_stars(country, amazon_boxes, Amazon.stars))

    trials = [amazon_names, amazon_images, amazon_urls, amazon_price,
              amazon_ids, amazon_reviews, amazon_stars]

    for test in trials:
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(len(amazon_boxes), test)
def get_reviews(boxes_array, Page, country='mx', test_all=False, test_len=False, position=None):
    """Read the number of reviews of every product box as an int.

    Args:
        boxes_array: Sequence of product boxes.
        Page: Store description. This function reads ``Page.reviews``
            (search description passed to ``search_boxes``) and
            ``Page.money_dict[country]['thousands']``.
        country: Key into ``Page.money_dict``.
        test_all, test_len, position: Accepted for signature compatibility;
            not used by this function.

    Returns:
        A list with one entry per box, in the same order: the review count
        parsed from the first match's text with the thousands separator
        removed, or ``None`` when nothing was found or the text could not
        be parsed.
    """
    reviews = [None] * len(boxes_array)
    thousands_sep = Page.money_dict[country]['thousands']
    for idx, box in enumerate(boxes_array):
        matches = search_boxes(box, Page.reviews)
        if not matches:
            continue
        try:
            reviews[idx] = int(matches[0].get_text().replace(thousands_sep, ''))
        except (AttributeError, TypeError, ValueError):
            # Best effort: an unparsable count keeps its None placeholder.
            # Only parsing failures are swallowed, so KeyboardInterrupt and
            # unrelated errors still propagate.
            continue
    return reviews
def scraper(Page, user_request, country):
    """Search ``Page`` for ``user_request`` and return its cheapest product.

    The steps are: build the search URL, download the page, split it into
    product boxes, read the prices, pick the cheapest one and hand it to
    ``get_cheapest``, whose result is returned as-is.
    """
    # Search URL for this store.
    search_url = Page.adapt_url(Page, country, user_request)

    # Whole page, then one box per product.
    page_html = extract_soup(search_url, 1, just_soup=True)
    product_boxes = search_boxes(page_html, Page.boxes)

    # Prices of every box, then position and price of the cheapest.
    box_prices = get_price(country, product_boxes, Page.price)
    best_idx, best_price = cheapest(box_prices, position_and_price=True)

    return get_cheapest(best_idx, product_boxes, best_price, country, Page)
def scraper(user_request, country):
    """Search Mercado Libre for ``user_request`` and return its cheapest product.

    The steps are: build the search URL, download the page, split it into
    product boxes, read the prices, pick the cheapest one and hand it to
    ``get_cheapest``, whose result is returned as-is.
    """
    # Search URL for Mercado Libre.
    search_url = Mercado_Libre.adapt_url(Mercado_Libre, country, user_request)

    # Whole page, then one box per product.
    page_html = extract_soup(search_url, 1, just_soup=True)
    product_boxes = search_boxes(page_html, Mercado_Libre.boxes)

    # Prices of every box, then position and price of the cheapest.
    box_prices = get_price(country, product_boxes, Mercado_Libre.price)
    best_idx, best_price = cheapest(box_prices, position_and_price=True)

    return get_cheapest(best_idx, product_boxes, best_price, country,
                        Mercado_Libre)
def request_products(user_request, Page, header, home=False, country='mx',
                     max_retries=None):
    """Download a store's results page and extract its product info.

    Args:
        user_request: Text the user searched for.
        Page: Store description. This function reads ``Page.adapt_url``,
            ``Page.boxes``, ``Page.name`` and ``Page.index``.
        header: Forwarded to ``extract_soup`` on every attempt.
        home: Forwarded to ``get_price``.
        country: Forwarded to ``Page.adapt_url`` and ``get_price``.
        max_retries: Maximum number of extra attempts while the status is
            503. ``None`` (default) retries without limit, as before.

    Returns:
        A dict. On status 200 it holds ``names``, ``images``, ``urls``,
        ``prices`` and ``status``. On any other final status it holds
        ``store``, ``idx``, ``product``, empty lists for the four info keys,
        and ``status``.
    """
    page_url = Page.adapt_url(Page, user_request, country)
    page_soup, status = extract_soup(page_url, header)

    # Keep asking while the status is 503. The final status is then handled
    # below; previously the function returned None after a retry and the
    # retry dropped the header.
    attempts = 0
    while status == 503 and (max_retries is None or attempts < max_retries):
        time.sleep(1)
        page_soup, status = extract_soup(page_url, header)
        attempts += 1

    if status == 200:
        # One box per product.
        page_boxes = search_boxes(page_soup, Page.boxes)
        return {
            'names': get_names(page_boxes, Page),
            'images': get_images(page_boxes, Page),
            'urls': get_products_urls(page_boxes, Page),
            'prices': get_price(country, page_boxes, Page, home),
            'status': status,
        }

    # Empty lists (not None) tell the caller there is nothing to upload; a
    # None entry would instead be read as "a product box without info".
    return {
        'store': Page.name,
        'idx': Page.index,
        'product': user_request,
        'names': [],
        'images': [],
        'urls': [],
        'prices': [],
        'status': status,
    }
def scraper(user_request, country):
    """Search Amazon for ``user_request`` and return its cheapest product.

    The steps are: build the search URL, download the page, split it into
    product boxes, read the prices, pick the cheapest one and hand it to
    ``get_cheapest``, whose result is returned as-is.
    """
    # Search URL for Amazon.
    search_url = Amazon.adapt_url(Amazon, user_request, country)

    # Whole page, then one box per product.
    page_html = extract_soup(search_url, 1, just_soup=True)
    product_boxes = search_boxes(page_html, Amazon.boxes)

    # Prices of every box, then position and price of the cheapest.
    box_prices = get_price(country, product_boxes, Amazon.price)
    best_idx, best_price = cheapest(box_prices, position_and_price=True)

    return get_cheapest(best_idx, product_boxes, best_price, country, Amazon)
# NOTE(review): the next two lines are the tail of a function whose start is
# outside this view; they are reproduced unchanged.
                                         country, Amazon)
    return cheapest_amazon_product_dictionary


# Manual smoke run: fetch one Amazon results page and fill a dict with the
# output of each info getter.
if __name__ == "__main__":
    user_request = 'audifonos inalambricos'
    country = 'mx'
    amazon_url = Amazon.adapt_url(Amazon, user_request, country)
    # All the HTML of the page
    amazon_soup = extract_soup(amazon_url, 1, just_soup=True)
    # HTML divided by products, and stored as elements of an array
    amazon_boxes = search_boxes(amazon_soup, Amazon.boxes)
    # One list per kind of info, keyed by its name.
    amazon_products = {}
    amazon_products['name'] = get_names(amazon_boxes, Amazon.name_and_images)
    '''Amazon's images source (link)'''
    amazon_products['image'] = get_images(amazon_boxes, Amazon)
    amazon_products['url'] = get_products_urls(amazon_boxes, Amazon)
    '''Just Amazon's products id. Is used as a url generator: amazon's url + domain + "/dp/" + product_id'''
    # Product-id extraction is currently disabled:
    # amazon_products['id']= amazon_products_id(amazon_boxes)
    '''Just stars as float'''
    amazon_products['star'] = get_stars(country, amazon_boxes, Amazon.stars)
    '''Just number of reviews as int'''
    amazon_products['review'] = get_reviews(country, amazon_boxes, Amazon.reviews)
def _parse_page_price(raw_text, coin_symbol, k_sep, d_sep, tps):
    """Return the first number found in *raw_text* as a float, or None."""
    # When two prices are shown, only the first one is used.
    first_price = raw_text.split(tps)[0]
    normalized = (first_price.replace(coin_symbol, '')
                  .replace(k_sep, '')
                  .replace(d_sep, '.')
                  .replace('\xa0', ''))
    # Integer or decimal amount; the decimal part is optional so "5" and
    # "1500" parse as well as "23.99".
    found = re.search(r'\d+(?:\.\d+)?', normalized)
    return float(found.group()) if found else None


def get_price(country, boxes_array, Page, test_all=False, test_len=False,
              position=None, conversion_divisor=22):
    """Read the price of every product box, or of a single one.

    Args:
        country: Key into ``Page.money_dict``. In full-scan mode only
            ``'mx'`` is parsed; other values leave every entry ``None``.
        boxes_array: Sequence of product boxes.
        Page: Store description. This function reads ``Page.price`` (search
            description passed to ``search_boxes``), ``Page.name`` and the
            ``coin``, ``thousands``, ``decimal`` and ``two_prices_sep``
            entries of ``Page.money_dict[country]``.
        test_all, test_len: Accepted for signature compatibility; not used
            by this function.
        position: Index of a single box to inspect, e.g. the cheapest one.
            ``None`` (default) scans every box. Index ``0`` is honoured.
        conversion_divisor: Value that prices of every store other than
            ``'Ebay'`` are divided by before rounding to two decimals.
            Presumably a currency exchange rate; confirm with the owner.

    Returns:
        With ``position=None``: a list with one float or ``None`` per box.
        With a position: the price of that box as a float. If the search
        finds nothing, the list of ``None`` placeholders is returned
        unchanged.

    Raises:
        ValueError: In single-box mode, when the matched text holds no
            parsable price.
    """
    price = [None] * len(boxes_array)
    money = Page.money_dict[country]
    coin_symbol = money['coin']
    k_sep = money['thousands']
    d_sep = money['decimal']
    tps = money['two_prices_sep']
    is_ebay = Page.name == 'Ebay'

    def _to_output(amount):
        # Ebay amounts are returned as parsed; every other store is scaled.
        return amount if is_ebay else round(amount / conversion_divisor, 2)

    if position is not None:
        matches = search_boxes(boxes_array[position], Page.price)
        if matches:
            raw_text = matches[0].get_text()
            try:
                amount = _parse_page_price(raw_text, coin_symbol, k_sep,
                                           d_sep, tps)
                if amount is None:
                    raise ValueError('no numeric price found')
            except (TypeError, ValueError) as err:
                error_message = (
                    'String index out of range.\n'
                    f'Money dictionary: {Page.money_dict}\n'
                    f'Original String: {raw_text}\n'
                    f'Box #{position}'
                )
                raise ValueError(error_message) from err
            price = _to_output(amount)
    else:
        for idx, box in enumerate(boxes_array):
            matches = search_boxes(box, Page.price)
            if not matches or country != 'mx':
                continue
            amount = _parse_page_price(matches[0].get_text(), coin_symbol,
                                       k_sep, d_sep, tps)
            # Text without a number keeps its None placeholder for every
            # store, instead of raising IndexError for non-Ebay ones.
            if amount is not None:
                price[idx] = _to_output(amount)
    return price
from scrape_data import Mercado_Libre, Products
from scrape_funcs import extract_soup, search_boxes, get_brute_info
from data_filters import get_names, get_images, get_products_urls, get_price
from general_funcs import cheapest, get_cheapest

# Manual run: fetch one Mercado Libre results page, extract the info of every
# product and print the cheapest one.
user_request = 'audifonos inalambricos'
country = 'mx'

ml_url = Mercado_Libre.adapt_url(Mercado_Libre, country, user_request)
# All the HTML of the page
ml_soup = extract_soup(ml_url, 1, just_soup=True)
# HTML divided by products, and stored as elements of an array
ml_boxes = search_boxes(ml_soup, Mercado_Libre.boxes)

ml_products = {}
ml_products['names'] = get_names(ml_boxes, Mercado_Libre.name_and_images)
# Mercado Libre's images source (link)
ml_products['images'] = get_images(ml_boxes, Mercado_Libre.name_and_images)
ml_products['urls'] = get_products_urls(ml_boxes, Mercado_Libre.product_urls)
ml_products['prices'] = get_price(country, ml_boxes, Mercado_Libre.price)

# Stored under its own name: assigning the result to `cheapest` rebound the
# imported function and made it uncallable afterwards.
cheapest_idx = cheapest(ml_products['prices'])
cheapest_ml_product = get_cheapest(cheapest_idx, ml_products)

for key in cheapest_ml_product:
    print(key, ':', cheapest_ml_product[key])
# NOTE(review): the next two statements are the tail of a function whose start
# is outside this view; they are reproduced unchanged.
    cheapest_product_dictionary = get_cheapest(cheapest_idx, boxes, cheapest_price, country, Page)
    return cheapest_product_dictionary


# Manual smoke run: fetch one Ebay results page, fill a dict with the output
# of each info getter and look up the cheapest product.
if __name__ == "__main__":
    user_request = 'audifonos inalambricos'
    country = 'mx'
    ebay_url = Ebay.adapt_url(Ebay, user_request, country)
    # All the HTML of the page
    ebay_soup = extract_soup(ebay_url, 1, just_soup=True)
    # HTML divided by products, and stored as elements of an array
    ebay_boxes = search_boxes(ebay_soup, Ebay.boxes)
    # print(ebay_boxes)
    # One list per kind of info, keyed by its name.
    ebay_products = {}
    ebay_products['names'] = get_names(ebay_boxes, Ebay.name_and_images)
    # Ebay's images source (link)
    ebay_products['images'] = get_images(ebay_boxes, Ebay)
    ebay_products['urls'] = get_products_urls(ebay_boxes, Ebay)
    ebay_products['prices'] = get_price(country, ebay_boxes, Ebay.price)
    # Result of cheapest() on the price list, passed on with the info dict.
    cheapest_idx = cheapest(ebay_products['prices'])
    cheapest_ebay_product2 = get_cheapest(cheapest_idx, ebay_products)
    print(f'\nTest ONE:')
def get_price(country, boxes_array, info_tuple, test_all=False, test_len=False, position=None):
    """Read the price of every product box, or of a single one.

    Args:
        country: Key into the module-level ``money_dict``. In full-scan mode
            only ``'mx'`` is parsed; other values leave every entry ``None``.
        boxes_array: Sequence of product boxes.
        info_tuple: Search description forwarded unchanged to
            ``search_boxes``.
        test_all: When truthy, print every intermediate value and the final
            result.
        test_len: When truthy (and ``test_all`` is not), print only the
            number of entries.
        position: Index of a single box to inspect, e.g. the cheapest one.
            ``None`` (default) scans every box. Index ``0`` is honoured.

    Returns:
        With ``position=None``: a list with one float or ``None`` per box.
        With a position: the price of that box as a float. If the search
        finds nothing, the list of ``None`` placeholders is returned
        unchanged.

    Raises:
        ValueError: When a matched text cannot be converted to a float. The
            message carries the money dictionary, the original text and the
            box number.
    """
    price = [None] * len(boxes_array)
    money = money_dict[country]
    coin_symbol = money['coin']
    k_sep = money['thousands']
    d_sep = money['decimal']
    tps = money['two_prices_sep']

    def _normalize(raw_text, debug=False):
        # When two prices are shown, only the first one is used.
        pieces = raw_text.split(tps)
        if debug:
            print(pieces)
        cleaned = (pieces[0].replace(coin_symbol, '')
                   .replace(k_sep, '')
                   .replace(d_sep, '.'))
        if debug:
            print(cleaned)
        # Ebay special case: strip non-breaking spaces.
        return cleaned.replace('\xa0', '')

    if position is not None:
        matches = search_boxes(boxes_array[position], info_tuple)
        if matches:
            raw_text = matches[0].get_text()
            try:
                price = float(_normalize(raw_text))
            except (TypeError, ValueError) as err:
                error_message = (
                    'String index out of range.\n'
                    f'Money dictionary: {money_dict}\n'
                    f'Original String: {raw_text}\n'
                    f'Box #{position}'
                )
                raise ValueError(error_message) from err
    else:
        for idx, box in enumerate(boxes_array):
            matches = search_boxes(box, info_tuple)
            if test_all:
                print(matches)
            if not matches or country != 'mx':
                continue
            raw_text = matches[0].get_text()
            # Tracks how far the cleanup got, for the error message.
            price_string = raw_text
            try:
                price_string = _normalize(raw_text, debug=bool(test_all))
                price[idx] = float(price_string)
            except (TypeError, ValueError) as err:
                error_message = (
                    'Info about the Value.\n'
                    f'Money dictionary: {money_dict}\n'
                    f'Original String: {raw_text}\n'
                    f'Price string: {price_string}\n'
                    f'Box #{idx}'
                )
                raise ValueError(error_message) from err

    if test_all:
        print('prices:', len(price), price)
    elif test_len:
        print('prices:', len(price))
    return price