def get_reviews(country, boxes_array, info_tuple, test=False):
    """Extract the review count of every product box.

    Args:
        country: Marketplace code. For ``'mx'`` the ``,`` characters are
            removed before parsing, for ``'br'`` the ``.`` characters; any
            other value leaves every entry as ``None``.
        boxes_array: Sequence of product boxes; each one is passed to
            ``search_boxes``.
        info_tuple: Selector forwarded unchanged to ``search_boxes``.
        test: When truthy, print the resulting list before returning it.

    Returns:
        A list with one entry per box, in the same order: the review count
        as ``int``, or ``None`` when nothing matched or the text could not be
        parsed.
    """
    # Character removed from the text before int() parsing, per marketplace.
    thousands_separators = {'mx': ',', 'br': '.'}
    separator = thousands_separators.get(country)

    reviews = [None] * len(boxes_array)
    for index, box in enumerate(boxes_array):
        # Remember that boxes are arrays
        searcher = search_boxes(box, info_tuple)
        if not searcher or separator is None:
            continue
        try:
            # Only the first match is relevant.
            raw_text = searcher[0].get_text()
            reviews[index] = int(raw_text.replace(separator, ''))
        except (AttributeError, TypeError, ValueError):
            # Best effort: an unparsable entry keeps its None placeholder so
            # the result stays aligned with boxes_array. Interrupts and
            # system exits are deliberately not swallowed.
            pass

    if test:
        print(reviews)
    return reviews
def get_price(country, boxes_array, info_tuple, test=False):
    """Extract the price of every product box as a float.

    Args:
        country: Marketplace code, also used as the key into ``coins_dict``.
            ``'mx'`` drops ``,`` before parsing; ``'br'`` drops ``.`` and then
            turns ``,`` into the decimal point. Any other code present in
            ``coins_dict`` leaves every entry as ``None``.
        boxes_array: Sequence of product boxes; each one is passed to
            ``search_boxes``.
        info_tuple: Selector forwarded unchanged to ``search_boxes``.
        test: When truthy, print the resulting list before returning it.

    Returns:
        A list with one entry per box, in the same order: the price as
        ``float``, or ``None`` when nothing matched or the text could not be
        parsed.

    Raises:
        KeyError: If ``country`` is not a key of ``coins_dict``.
    """
    price = [None] * len(boxes_array)
    # Used as the slice start on the matched text -- presumably the length of
    # the currency prefix to skip; confirm against the coins_dict definition.
    coin_symbol = coins_dict[country]

    # Turns the marketplace's number formatting into something float() reads.
    normalizers = {
        'mx': lambda text: text.replace(',', ''),
        'br': lambda text: text.replace('.', '').replace(',', '.'),
    }
    normalize = normalizers.get(country)

    for index, box in enumerate(boxes_array):
        # Remember that boxes are arrays
        searcher = search_boxes(box, info_tuple)
        if not searcher or normalize is None:
            continue
        try:
            amount_text = searcher[0].get_text()[coin_symbol:]
            price[index] = float(normalize(amount_text))
        except (AttributeError, TypeError, ValueError):
            # Best effort: an unparsable entry keeps its None placeholder so
            # the result stays aligned with boxes_array. Interrupts and
            # system exits are deliberately not swallowed.
            pass

    if test:
        print(price)
    return price
def test_get_brute_info_including_Nones(self):
    # Searching 'audifonos inalambricos' with country 'mx' must produce
    # exactly 60 product boxes.
    query = 'audifonos inalambricos'
    marketplace = 'mx'
    search_url = Amazon.adapt_url(Amazon, marketplace, query)
    page_soup = extract_soup(search_url, 1, just_soup=True)

    # New test
    product_boxes = search_boxes(page_soup, Amazon.boxes)
    box_count = len(product_boxes)
    self.assertEqual(box_count, 60)
def test_get_brute_info_without_losses(self):
    # get_brute_info must return one entry per product box, so no box is
    # dropped when the stars information is collected.
    query = 'audifonos inalambricos'
    marketplace = 'mx'
    search_url = Amazon.adapt_url(Amazon, marketplace, query)
    page_soup = extract_soup(search_url, 1, just_soup=True)
    product_boxes = search_boxes(page_soup, Amazon.boxes)

    # New test
    raw_stars = get_brute_info(product_boxes, Amazon.stars)
    expected_count = len(product_boxes)
    actual_count = len(raw_stars)
    self.assertEqual(expected_count, actual_count)
def get_images(boxes_array, info_tuple, test=False):
    """Extract the image source of every product box.

    Args:
        boxes_array: Sequence of product boxes; each one is passed to
            ``search_boxes``.
        info_tuple: Selector forwarded unchanged to ``search_boxes``.
        test: When truthy, print the resulting list before returning it.

    Returns:
        A list with one entry per box, in the same order: the ``src``
        attribute of the first match's ``img`` child, or ``None`` when
        nothing matched, the match has no ``img`` child, or the image has no
        ``src``.
    """
    images = [None] * len(boxes_array)
    for index, box in enumerate(boxes_array):
        # Remember that boxes are arrays
        searcher = search_boxes(box, info_tuple)
        if not searcher:
            continue
        # Presumably a BeautifulSoup tag, whose ``.img`` is None when there
        # is no <img> descendant; guard so one odd box does not abort the
        # whole page and the result stays aligned with boxes_array.
        image_tag = searcher[0].img
        if image_tag is not None:
            images[index] = image_tag.get('src')

    if test:
        print(images)
    return images
def get_products_urls(boxes_array, info_tuple, test=False,
                      base_url='https://www.amazon.com.mx'):
    """Build the absolute product URL of every product box.

    Args:
        boxes_array: Sequence of product boxes; each one is passed to
            ``search_boxes``.
        info_tuple: Selector forwarded unchanged to ``search_boxes``.
        test: When truthy, print the resulting list before returning it.
        base_url: Prefix prepended to each ``href``. Defaults to the
            previously hard-coded Mexican domain, so existing callers are
            unaffected; pass another domain for other marketplaces.

    Returns:
        A list with one entry per box, in the same order: ``base_url``
        followed by the first match's ``href``, or ``None`` when nothing
        matched or the match carries no ``href``.
    """
    urls = [None] * len(boxes_array)
    for index, box in enumerate(boxes_array):
        # Remember that boxes are arrays
        searcher = search_boxes(box, info_tuple)
        if not searcher:
            continue
        source_url = searcher[0].get('href')
        # A match without an href would make the concatenation below raise
        # TypeError; keep the None placeholder instead.
        if source_url is not None:
            urls[index] = base_url + source_url

    if test:
        print(urls)
    return urls
def get_stars(country, boxes_array, info_tuple, test=False):
    """Extract the star rating of every product box as a float.

    Only the first three characters of the matched text are parsed.

    Args:
        country: Marketplace code. ``'mx'`` parses those characters as they
            are; ``'br'`` first turns ``,`` into the decimal point. Any other
            value leaves every entry as ``None``.
        boxes_array: Sequence of product boxes; each one is passed to
            ``search_boxes``.
        info_tuple: Selector forwarded unchanged to ``search_boxes``.
        test: When truthy, print the resulting list before returning it.

    Returns:
        A list with one entry per box, in the same order: the rating as
        ``float``, or ``None`` when nothing matched or the text could not be
        parsed.
    """
    # Converts the three-character rating prefix into float() input.
    normalizers = {
        'mx': lambda text: text,
        'br': lambda text: text.replace(',', '.'),
    }
    normalize = normalizers.get(country)

    stars = [None] * len(boxes_array)
    for index, box in enumerate(boxes_array):
        # Remember that boxes are arrays
        searcher = search_boxes(box, info_tuple)
        if not searcher or normalize is None:
            continue
        try:
            rating_text = searcher[0].get_text()[:3]
            stars[index] = float(normalize(rating_text))
        except (AttributeError, TypeError, ValueError):
            # Same best-effort policy as the review and price getters: an
            # unparsable rating keeps its None placeholder instead of
            # aborting the whole page.
            pass

    if test:
        print(stars)
    return stars
def test_products_info_getters(self):
    # Every getter must return exactly one entry per product box, so the
    # per-field lists stay aligned with each other.
    user_request = 'audifonos inalambricos'
    country = 'mx'
    amazon_url = Amazon.adapt_url(Amazon, country, user_request)
    amazon_soup = extract_soup(amazon_url, 1, just_soup=True)
    amazon_boxes = search_boxes(amazon_soup, Amazon.boxes)

    # Getters are evaluated in the same order as before; the keys only
    # label a failure with the getter that produced it.
    lengths = {
        'names': len(get_names(amazon_boxes, Amazon.name_and_images)),
        'images': len(get_images(amazon_boxes, Amazon.name_and_images)),
        'urls': len(get_products_urls(amazon_boxes, Amazon.product_urls)),
        'price': len(get_price(country, amazon_boxes, Amazon.price)),
        'ids': len(amazon_products_id(amazon_boxes)),
        'reviews': len(get_reviews(country, amazon_boxes, Amazon.reviews)),
        'stars': len(get_stars(country, amazon_boxes, Amazon.stars)),
    }

    expected = len(amazon_boxes)
    for getter_name, length in lengths.items():
        with self.subTest(getter=getter_name):
            # assertEqual replaces the deprecated assertEquals alias, which
            # no longer exists in Python 3.12.
            self.assertEqual(expected, length)
# sys.path.insert(1, '"web scraper"/Amazon') from Amazon.data_filters import get_names, get_images, get_products_urls, get_price from Amazon.data_filters import get_stars, get_reviews, amazon_products_id from bs4 import BeautifulSoup user_request = 'audifonos inalambricos' country = 'mx' amazon_url = Amazon.adapt_url(Amazon, country, user_request) #All the HTML of the page amazon_soup = extract_soup(amazon_url, 1, just_soup=True) #HTML divided by products, and stored as elements of an array amazon_boxes = search_boxes(amazon_soup, Amazon.boxes) amazon_products = {} amazon_products['name'] = get_names(amazon_boxes, Amazon.name_and_images) '''Amazon's images source (link)''' amazon_products['image'] = get_images(amazon_boxes, Amazon.name_and_images) amazon_products['url'] = get_products_urls(amazon_boxes, Amazon.product_urls) '''Just Amazon's products id. Is used as a url generator: amazon's url + domain + "/dp/" + product_id''' amazon_products['id']= amazon_products_id(amazon_boxes) '''Just stars as float''' amazon_products['star'] = get_stars(country, amazon_boxes, Amazon.stars)