def get_images(boxes_array, info_tuple, test=False):
    """Collect the image source of every product box.

    Args:
        boxes_array: Sized sequence of product boxes (each box is passed
            unchanged to ``search_boxes``).
        info_tuple: Selector information forwarded to ``search_boxes``.
        test: When exactly ``True``, print the resulting list.

    Returns:
        A list as long as ``boxes_array`` holding, per box, the ``src``
        attribute of the first match's ``.img`` child, or ``None`` when
        ``search_boxes`` found nothing in that box.
    """
    images = [None] * len(boxes_array)

    for index, box in enumerate(boxes_array):
        # Each box is itself a searchable chunk of the page.
        matches = search_boxes(box, info_tuple)
        if not matches:
            continue
        images[index] = matches[0].img.get('src')

    if test == True:
        print(images)
    return images
def _anchor_href(match, url_attr):
    """Return ``url_attr`` of the match's ``.a`` child, or None if it has none."""
    anchor = match.a
    return None if anchor is None else anchor.get(url_attr)


def _amazon_href(match, url_attr):
    """Return the absolute Amazon URL from the match's own link, or None."""
    relative_url = match.get(url_attr)
    if relative_url is None:
        return None
    return 'https://www.amazon.com.mx' + relative_url


def get_products_urls(boxes_array,
                      Page,
                      test_all=False,
                      test_len=False,
                      position=None):
    """Extract the product URL of every box, or of a single box.

    Args:
        boxes_array: Sized sequence of product boxes.
        Page: Page description; ``url_get`` (attribute name holding the
            link), ``product_urls`` (selector info for ``search_boxes``)
            and ``__name__`` are read from it.
        test_all: When truthy, print every raw link and the final result.
        test_len: When truthy (and ``test_all`` is not), print the number
            of URLs collected.
        position: Index of a single box to inspect, e.g. the position of
            the cheapest product. ``None`` processes every box.

    Returns:
        Without ``position``: a list as long as ``boxes_array`` with one
        URL (or ``None``) per box. With ``position``: the URL of that box
        (or ``None`` if the matched element carries no link); if nothing
        matched in that box the untouched ``[None] * len(boxes_array)``
        list is returned, as before.
    """
    urls = [None] * len(boxes_array)
    url_attr = Page.url_get
    page_name = getattr(Page, '__name__', None)
    is_amazon = page_name == 'Amazon'

    # ``is not None`` so that position 0 (the first box) is honoured.
    if position is not None:
        searcher = search_boxes(boxes_array[position], Page.product_urls)
        if searcher:
            # Same per-page extraction as the list mode below, so Amazon
            # links are read from the match itself and made absolute.
            if is_amazon:
                urls = _amazon_href(searcher[0], url_attr)
            else:
                urls = _anchor_href(searcher[0], url_attr)

    else:
        # 'Mercado_Libre' is accepted as well because a class named
        # Mercado_Libre can never report the spaced spelling as __name__.
        uses_anchor = page_name in ('Ebay', 'Mercado Libre', 'Mercado_Libre')
        for index, box in enumerate(boxes_array):
            searcher = search_boxes(box, Page.product_urls)
            # A box without a match keeps its None placeholder instead of
            # raising IndexError on searcher[0].
            if not searcher:
                continue
            if test_all:
                print(searcher[0].get(url_attr))
            if is_amazon:
                urls[index] = _amazon_href(searcher[0], url_attr)
            elif uses_anchor:
                urls[index] = _anchor_href(searcher[0], url_attr)

    if test_all:
        print(urls)
    elif test_len:
        print('urls:', len(urls))
    return urls
Example #3
0
def get_products_urls(boxes_array, info_tuple, test=False):
    """Build the absolute Amazon Mexico URL of every product box.

    Args:
        boxes_array: Sized sequence of product boxes.
        info_tuple: Selector information forwarded to ``search_boxes``.
        test: When exactly ``True``, print the resulting list.

    Returns:
        A list as long as ``boxes_array``; each entry is the site prefix
        plus the ``href`` of the first match in that box, or ``None`` when
        nothing matched.
    """
    amazon_root = 'https://www.amazon.com.mx'
    urls = [None] * len(boxes_array)

    for index, box in enumerate(boxes_array):
        # Each box is itself a searchable chunk of the page.
        found = search_boxes(box, info_tuple)
        if not found:
            continue
        relative_link = found[0].get('href')
        urls[index] = amazon_root + relative_link

    if test == True:
        print(urls)
    return urls
Example #4
0
    def test_products_info_getters(self):
        """Every Ebay getter must return one entry per product box."""
        user_request = 'audifonos inalambricos'
        country = 'mx'
        search_url = Ebay.adapt_url(Ebay, user_request, country)
        soup = extract_soup(search_url, 1, just_soup=True)
        boxes = search_boxes(soup, Ebay.boxes)

        # Result length of each getter, keyed by a descriptive label.
        # The price getter is intentionally left out of this check.
        result_sizes = {
            'ebay_names': len(get_names(boxes, Ebay.name_and_images)),
            'ebay_images': len(get_images(boxes, Ebay)),
            'ebay_urls': len(get_products_urls(boxes, Ebay)),
        }

        for size in result_sizes.values():
            self.assertEqual(len(boxes), size)
def get_stars(boxes_array, Page, country='mx', test_all=False, test_len=False, position=None):
    """Read the star rating of every product box as a float.

    Args:
        boxes_array: Sized sequence of product boxes.
        Page: Page description; ``money_dict[country]['decimal']`` (the
            decimal separator) and ``stars`` (selector info for
            ``search_boxes``) are read from it.
        country: Key into ``Page.money_dict``.
        test_all, test_len, position: Accepted for signature parity with
            the other getters; not used here.

    Returns:
        A list as long as ``boxes_array`` with the parsed rating per box,
        or ``None`` where nothing matched.

    Raises:
        ValueError: If the first three characters of a matched text do
            not form a number.
    """
    stars = [None] * len(boxes_array)
    decimal_sep = Page.money_dict[country]['decimal']

    for index, box in enumerate(boxes_array):
        matches = search_boxes(box, Page.stars)
        if not matches:
            continue
        # Only the leading three characters (e.g. "4.5") hold the rating.
        rating_text = matches[0].get_text()[:3]
        if decimal_sep != '.':
            # Normalise the local separator to a dot; dropping it would
            # turn "4,5" into 45.0.
            rating_text = rating_text.replace(decimal_sep, '.')
        stars[index] = float(rating_text)

    return stars
Example #6
0
 def test_products_info_getters(self):
     """For each page in ``self.Pages`` every getter must cover all boxes."""
     user_request = 'audifonos inalambricos'
     country = 'mx'
     for page_cls in self.Pages:
         search_url = page_cls.adapt_url(page_cls, user_request, country)
         soup = extract_soup(search_url, 1, just_soup=True)
         boxes = search_boxes(soup, page_cls.boxes)

         # Number of entries returned by each getter for this page.
         getter_sizes = (
             len(get_names(boxes, page_cls)),
             len(get_images(boxes, page_cls)),
             len(get_products_urls(boxes, page_cls)),
             len(get_price(country, boxes, page_cls)),
         )

         for size in getter_sizes:
             self.assertEqual(len(boxes), size)
Example #7
0
    def test_cheapest_gets_info(self):
        """The cheapest Ebay product must have no ``None`` field."""
        user_request = 'audifonos inalambricos'
        country = 'mx'

        search_url = Ebay.adapt_url(Ebay, user_request, country)
        soup = extract_soup(search_url, 1, just_soup=True)
        boxes = search_boxes(soup, Ebay.boxes)
        prices = get_price(country, boxes, Ebay.price)

        # Locate the cheapest entry, then gather its details.
        best_idx, best_price = cheapest(prices, position_and_price=True)
        product = get_cheapest(best_idx, boxes, best_price, country, Ebay)

        for field in product:
            self.assertIsNotNone(product[field])
Example #8
0
    def test_cheapest_gets_info(self):
        """The cheapest Mercado Libre product must have no ``None`` field."""
        user_request = 'audifonos inalambricos'
        country = 'mx'

        search_url = Mercado_Libre.adapt_url(Mercado_Libre, user_request,
                                             country)
        soup = extract_soup(search_url, 1, just_soup=True)
        boxes = search_boxes(soup, Mercado_Libre.boxes)
        prices = get_price(country, boxes, Mercado_Libre.price)

        # Locate the cheapest entry, then gather its details.
        best_idx, best_price = cheapest(prices, position_and_price=True)
        product = get_cheapest(best_idx, boxes, best_price, country,
                               Mercado_Libre)

        for field in product:
            self.assertIsNotNone(product[field])
Example #9
0
    def test_products_info_getters(self):
        """Every Mercado Libre getter must return one entry per product box."""
        user_request = 'audifonos inalambricos'
        country = 'mx'
        search_url = Mercado_Libre.adapt_url(Mercado_Libre, user_request,
                                             country)
        soup = extract_soup(search_url, 1, just_soup=True)
        boxes = search_boxes(soup, Mercado_Libre.boxes)

        # Result length of each getter, keyed by a descriptive label.
        result_sizes = {
            'ml_names': len(get_names(boxes, Mercado_Libre.name_and_images)),
            'ml_images': len(get_images(boxes, Mercado_Libre)),
            'ml_urls': len(get_products_urls(boxes, Mercado_Libre)),
            'ml_price': len(get_price(country, boxes, Mercado_Libre.price)),
        }

        for size in result_sizes.values():
            self.assertEqual(len(boxes), size)
Example #10
0
def get_stars(country, boxes_array, info_tuple, test=False):
    """Read the star rating of every product box as a float.

    Args:
        country: ``'mx'`` parses the text as-is; ``'br'`` first swaps the
            decimal comma for a dot. Any other value leaves every entry
            as ``None``.
        boxes_array: Sized sequence of product boxes.
        info_tuple: Selector information forwarded to ``search_boxes``.
        test: When exactly ``True``, print the resulting list.

    Returns:
        A list as long as ``boxes_array`` with one rating (or ``None``)
        per box.
    """
    stars = [None] * len(boxes_array)

    for index, box in enumerate(boxes_array):
        # Each box is itself a searchable chunk of the page.
        found = search_boxes(box, info_tuple)
        if not found:
            continue

        # Only the leading three characters hold the rating.
        if country == 'mx':
            stars[index] = float(found[0].get_text()[:3])
        elif country == 'br':
            leading = found[0].get_text()[:3]
            stars[index] = float(leading.replace(',', '.'))

    if test == True:
        print(stars)
    return stars
Example #11
0
    def test_products_info_getters(self):
        """Every Amazon getter must return one entry per product box."""
        user_request = 'audifonos inalambricos'
        country = 'mx'
        amazon_url = Amazon.adapt_url(Amazon, country, user_request)
        amazon_soup = extract_soup(amazon_url, 1, just_soup=True)
        amazon_boxes = search_boxes(amazon_soup, Amazon.boxes)

        amazon_names = len(get_names(amazon_boxes, Amazon.name_and_images))
        amazon_images = len(get_images(amazon_boxes, Amazon.name_and_images))
        amazon_urls = len(get_products_urls(amazon_boxes, Amazon.product_urls))
        amazon_price = len(get_price(country, amazon_boxes, Amazon.price))
        amazon_ids = len(amazon_products_id(amazon_boxes))
        amazon_reviews = len(get_reviews(country, amazon_boxes, Amazon.reviews))
        amazon_stars = len(get_stars(country, amazon_boxes, Amazon.stars))

        trials = [amazon_names, amazon_images, amazon_urls, amazon_price,
                  amazon_ids, amazon_reviews, amazon_stars]
        for size in trials:
            # assertEqual, not the deprecated assertEquals alias, which was
            # removed from unittest in Python 3.12.
            self.assertEqual(len(amazon_boxes), size)
def get_reviews(boxes_array, Page, country='mx', test_all=False, test_len=False, position=None):
    """Read the review count of every product box as an int.

    Args:
        boxes_array: Sized sequence of product boxes.
        Page: Page description; ``money_dict[country]['thousands']`` (the
            thousands separator) and ``reviews`` (selector info for
            ``search_boxes``) are read from it.
        country: Key into ``Page.money_dict``.
        test_all, test_len, position: Accepted for signature parity with
            the other getters; not used here.

    Returns:
        A list as long as ``boxes_array`` with the review count per box,
        or ``None`` where nothing matched or the text is not an integer.
    """
    reviews = [None] * len(boxes_array)
    thousands_sep = Page.money_dict[country]['thousands']

    for index, box in enumerate(boxes_array):
        matches = search_boxes(box, Page.reviews)
        if not matches:
            continue

        # Only the first match is relevant, however many were found.
        count_text = matches[0].get_text().replace(thousands_sep, '')
        try:
            reviews[index] = int(count_text)
        except ValueError:
            # Best effort: text that is not a plain integer keeps the
            # None placeholder rather than aborting the whole scrape.
            continue

    return reviews
Example #13
0
def scraper(Page, user_request, country):
    """Scrape one results page and return its cheapest product.

    Args:
        Page: Page description providing ``adapt_url``, ``boxes`` and
            ``price``.
        user_request: Search text.
        country: Country code forwarded to the URL builder and the price
            parser.

    Returns:
        Whatever ``get_cheapest`` returns for the cheapest box
        (described in this file as a dictionary).
    """
    # Search URL for this page, then the full HTML behind it.
    search_url = Page.adapt_url(Page, country, user_request)
    page_soup = extract_soup(search_url, 1, just_soup=True)

    # HTML split per product, one element per box.
    product_boxes = search_boxes(page_soup, Page.boxes)

    # Prices per box, then the index and price of the cheapest one.
    box_prices = get_price(country, product_boxes, Page.price)
    best_idx, best_price = cheapest(box_prices, position_and_price=True)

    return get_cheapest(best_idx, product_boxes, best_price, country, Page)
Example #14
0
def scraper(user_request, country):
    """Scrape Mercado Libre and return its cheapest product.

    Args:
        user_request: Search text.
        country: Country code forwarded to the URL builder and the price
            parser.

    Returns:
        Whatever ``get_cheapest`` returns for the cheapest box
        (described in this file as a dictionary).
    """
    # Search URL, then the full HTML behind it.
    search_url = Mercado_Libre.adapt_url(Mercado_Libre, country, user_request)
    page_soup = extract_soup(search_url, 1, just_soup=True)

    # HTML split per product, one element per box.
    product_boxes = search_boxes(page_soup, Mercado_Libre.boxes)

    # Prices per box, then the index and price of the cheapest one.
    box_prices = get_price(country, product_boxes, Mercado_Libre.price)
    best_idx, best_price = cheapest(box_prices, position_and_price=True)

    return get_cheapest(best_idx, product_boxes, best_price, country,
                        Mercado_Libre)
Example #15
0
def request_products(user_request, Page, header, home=False, country='mx'):
    """Fetch one results page and return its product information.

    Args:
        user_request: Search text.
        Page: Page description providing ``adapt_url``, ``boxes``,
            ``name`` and ``index``.
        header: Passed to ``extract_soup`` with every request.
        home: Forwarded to ``get_price``.
        country: Country code forwarded to the URL builder and the price
            parser.

    Returns:
        On HTTP 200: a dict with ``names``, ``images``, ``urls``,
        ``prices`` and ``status``. On any other final status: a dict with
        ``store``, ``idx``, ``product``, empty ``names``/``images``/
        ``urls``/``prices`` lists and ``status``.
    """
    page_url = Page.adapt_url(Page, user_request, country)

    # All the HTML of the page
    page_soup, status = extract_soup(page_url, header)

    # Wait until the info arrives or the request is denied. The retry
    # sends the same header as the first attempt, and its outcome is
    # handled below instead of falling off the end of the function.
    # NOTE(review): this retries forever while the server answers 503.
    while status == 503:
        time.sleep(1)
        page_soup, status = extract_soup(page_url, header)

    if status == 200:
        # HTML divided by products, and stored as elements of an array
        page_boxes = search_boxes(page_soup, Page.boxes)

        # Obtain the info of the products
        return {
            'names': get_names(page_boxes, Page),
            'images': get_images(page_boxes, Page),
            'urls': get_products_urls(page_boxes, Page),
            'prices': get_price(country, page_boxes, Page, home),
            'status': status,
        }

    # Empty lists (not None) tell the rest of the script that nothing is
    # to be uploaded; a None would instead read as "a product box without
    # info", something that does occur on Amazon.
    return {
        'store': Page.name,
        'idx': Page.index,
        'product': user_request,
        'names': [],
        'images': [],
        'urls': [],
        'prices': [],
        'status': status,
    }
Example #16
0
def scraper(user_request, country):
    """Scrape Amazon and return its cheapest product.

    Args:
        user_request: Search text.
        country: Country code forwarded to the URL builder and the price
            parser.

    Returns:
        Whatever ``get_cheapest`` returns for the cheapest box
        (described in this file as a dictionary).
    """
    # Search URL, then the full HTML behind it.
    search_url = Amazon.adapt_url(Amazon, user_request, country)
    page_soup = extract_soup(search_url, 1, just_soup=True)

    # HTML split per product, one element per box.
    product_boxes = search_boxes(page_soup, Amazon.boxes)

    # Prices per box, then the index and price of the cheapest one.
    box_prices = get_price(country, product_boxes, Amazon.price)
    best_idx, best_price = cheapest(box_prices, position_and_price=True)

    return get_cheapest(best_idx, product_boxes, best_price, country, Amazon)
Example #17
0
                                                      country, Amazon)

    return cheapest_amazon_product_dictionary


if __name__ == "__main__":
    # Manual run: scrape one Amazon results page and collect the
    # per-product information.
    user_request = 'audifonos inalambricos'
    country = 'mx'
    amazon_url = Amazon.adapt_url(Amazon, user_request, country)

    # Full HTML of the results page.
    amazon_soup = extract_soup(amazon_url, 1, just_soup=True)

    # The HTML split per product, one element per box.
    amazon_boxes = search_boxes(amazon_soup, Amazon.boxes)

    amazon_products = {
        'name': get_names(amazon_boxes, Amazon.name_and_images),
        # Image source links.
        'image': get_images(amazon_boxes, Amazon),
        'url': get_products_urls(amazon_boxes, Amazon),
        # Product ids can be used as a url generator:
        # amazon's url + domain + "/dp/" + product_id (currently disabled)
        # 'id': amazon_products_id(amazon_boxes),
        # Stars as float.
        'star': get_stars(country, amazon_boxes, Amazon.stars),
        # Number of reviews as int.
        'review': get_reviews(country, amazon_boxes, Amazon.reviews),
    }
Example #18
0
def _parse_price_text(raw_text, coin_symbol, k_sep, d_sep, tps):
    """Return the first number found in a price label as a float, or None."""
    # When two prices are shown, only the one before the separator counts.
    first_price = raw_text.split(tps)[0]
    cleaned = first_price.replace(coin_symbol, '').replace(k_sep, '')
    cleaned = cleaned.replace(d_sep, '.')
    # Ebay special case: non-breaking spaces inside the label.
    cleaned = cleaned.replace(u'\xa0', u'')
    numbers = re.findall(r"(\d+\.?\d+)", cleaned)
    return float(numbers[0]) if numbers else None


def _page_amount(amount, Page):
    """Return *amount* as reported for *Page* (non-Ebay values are divided by 22)."""
    if Page.name != 'Ebay':
        # NOTE(review): 22 is presumably a fixed currency conversion
        # rate -- confirm and name it.
        return round(amount / 22, 2)
    return amount


def get_price(country,
              boxes_array,
              Page,
              test_all=False,
              test_len=False,
              position=None):
    """Parse the price of every product box, or of a single box.

    Args:
        country: Key into ``Page.money_dict``. In list mode only ``'mx'``
            boxes are parsed; other countries leave every entry ``None``.
        boxes_array: Sized sequence of product boxes.
        Page: Page description; ``money_dict`` (with ``coin``,
            ``thousands``, ``decimal`` and ``two_prices_sep`` per
            country), ``price`` (selector info for ``search_boxes``) and
            ``name`` are read from it.
        test_all, test_len: Accepted for signature parity with the other
            getters; not used here.
        position: Index of a single box to parse, e.g. the position of
            the cheapest product. ``None`` processes every box.

    Returns:
        Without ``position``: a list as long as ``boxes_array`` with one
        float (or ``None``) per box. With ``position``: the float price
        of that box; if nothing matched in that box the untouched
        ``[None] * len(boxes_array)`` list is returned, as before.

    Raises:
        ValueError: In ``position`` mode, when the matched text cannot be
            turned into a price.
    """
    price = [None] * len(boxes_array)
    coin_symbol = Page.money_dict[country]['coin']
    k_sep = Page.money_dict[country]['thousands']
    d_sep = Page.money_dict[country]['decimal']
    tps = Page.money_dict[country]['two_prices_sep']

    # ``is not None`` so that position 0 (the first box) is honoured.
    if position is not None:
        searcher = search_boxes(boxes_array[position], Page.price)
        if searcher:
            raw_text = searcher[0].get_text()
            error_message = (
                f'Could not parse a price from box #{position}. '
                f'Money dictionary: {Page.money_dict}. '
                f'Original string: {raw_text!r}'
            )
            try:
                # Same clean-up and pattern as the list mode below, so a
                # box that parsed there also parses here.
                amount = _parse_price_text(raw_text, coin_symbol, k_sep,
                                           d_sep, tps)
            except Exception as err:
                raise ValueError(error_message) from err
            if amount is None:
                raise ValueError(error_message)
            price = _page_amount(amount, Page)

    else:
        for index, box in enumerate(boxes_array):
            searcher = search_boxes(box, Page.price)
            if not searcher or country != 'mx':
                continue
            amount = _parse_price_text(searcher[0].get_text(), coin_symbol,
                                       k_sep, d_sep, tps)
            # A label without a recognisable number keeps None for every
            # page, instead of raising IndexError on non-Ebay pages.
            if amount is not None:
                price[index] = _page_amount(amount, Page)

    return price
from scrape_data import Mercado_Libre, Products
from scrape_funcs import extract_soup, search_boxes, get_brute_info
from data_filters import get_names, get_images, get_products_urls, get_price

from general_funcs import cheapest, get_cheapest

# Manual run: scrape one Mercado Libre results page and print the
# cheapest product found.
user_request = 'audifonos inalambricos'
country = 'mx'
ml_url = Mercado_Libre.adapt_url(Mercado_Libre, country, user_request)

# All the HTML of the page
ml_soup = extract_soup(ml_url, 1, just_soup=True)

# HTML divided by products, and stored as elements of an array
ml_boxes = search_boxes(ml_soup, Mercado_Libre.boxes)
ml_products = {}

ml_products['names'] = get_names(ml_boxes, Mercado_Libre.name_and_images)

# Mercado_Libre's images source (link)
ml_products['images'] = get_images(ml_boxes, Mercado_Libre.name_and_images)

ml_products['urls'] = get_products_urls(ml_boxes, Mercado_Libre.product_urls)

ml_products['prices'] = get_price(country, ml_boxes, Mercado_Libre.price)

# Stored under its own name so the imported ``cheapest`` function is not
# overwritten by its result.
cheapest_idx = cheapest(ml_products['prices'])
cheapest_ml_product = get_cheapest(cheapest_idx, ml_products)
for key in cheapest_ml_product:
    print(key, ':', cheapest_ml_product[key])
Example #20
0
    cheapest_product_dictionary = get_cheapest(cheapest_idx, boxes,
                                               cheapest_price, country, Page)

    return cheapest_product_dictionary


if __name__ == "__main__":
    # Manual run: scrape one Ebay results page and look up the cheapest
    # product.
    user_request = 'audifonos inalambricos'
    country = 'mx'
    ebay_url = Ebay.adapt_url(Ebay, user_request, country)

    # Full HTML of the results page.
    ebay_soup = extract_soup(ebay_url, 1, just_soup=True)

    # The HTML split per product, one element per box.
    ebay_boxes = search_boxes(ebay_soup, Ebay.boxes)

    ebay_products = {
        'names': get_names(ebay_boxes, Ebay.name_and_images),
        # Image source links.
        'images': get_images(ebay_boxes, Ebay),
        'urls': get_products_urls(ebay_boxes, Ebay),
        'prices': get_price(country, ebay_boxes, Ebay.price),
    }

    cheapest_idx = cheapest(ebay_products['prices'])
    cheapest_ebay_product2 = get_cheapest(cheapest_idx, ebay_products)

    print(f'\nTest ONE:')
def get_price(country,
              boxes_array,
              info_tuple,
              test_all=False,
              test_len=False,
              position=None):
    """Parse the price of every product box, or of a single box.

    Args:
        country: Key into the module-level ``money_dict``. In list mode
            only ``'mx'`` boxes are parsed; other countries leave every
            entry ``None``.
        boxes_array: Sized sequence of product boxes.
        info_tuple: Selector information forwarded to ``search_boxes``.
        test_all: When truthy, print intermediate values and the result.
        test_len: When truthy (and ``test_all`` is not), print the number
            of prices collected.
        position: Index of a single box to parse, e.g. the position of
            the cheapest product. ``None`` processes every box.

    Returns:
        Without ``position``: a list as long as ``boxes_array`` with one
        float (or ``None``) per box. With ``position``: the float price
        of that box; if nothing matched in that box the untouched
        ``[None] * len(boxes_array)`` list is returned, as before.

    Raises:
        ValueError: When a matched text cannot be turned into a float.
    """
    price = [None] * len(boxes_array)
    coin_symbol = money_dict[country]['coin']
    k_sep = money_dict[country]['thousands']
    d_sep = money_dict[country]['decimal']
    tps = money_dict[country]['two_prices_sep']
    # Placeholder shown in the error message if parsing fails early.
    price_string = 'start'

    # ``is not None`` so that position 0 (the first box) is honoured.
    if position is not None:
        searcher = search_boxes(boxes_array[position], info_tuple)
        if searcher:
            try:
                # When two prices are shown, keep the one before the
                # separator, then strip symbol and separators.
                price_string = searcher[0].get_text().split(tps)
                price_string = price_string[0].replace(
                    coin_symbol, '').replace(k_sep, '').replace(d_sep, '.')
                # Special case: non-breaking spaces inside the label.
                price_string = price_string.replace('\xa0', '')
                price = float(price_string)
            except Exception as err:
                error_message = f'''Could not parse the price.
                Money dictionary: {money_dict}
                Original String: {searcher[0].get_text()}
                Box #{position}'''
                raise ValueError(error_message) from err

    # For testing the functions and Xpaths
    else:
        for b, box in enumerate(boxes_array):
            searcher = search_boxes(box, info_tuple)
            if test_all:
                print(searcher)
            if searcher:
                if country == 'mx':
                    try:
                        price_string = searcher[0].get_text().split(tps)
                        if test_all:
                            print(price_string)
                        price_string = price_string[0].replace(
                            coin_symbol, '').replace(k_sep,
                                                     '').replace(d_sep, '.')
                        if test_all:
                            print(price_string)
                        # Ebay special case: non-breaking spaces.
                        price_string = price_string.replace('\xa0', '')
                        price[b] = float(price_string)
                    except Exception as err:
                        error_message = f'''Info about the Value. 
                        Money dictionary: {money_dict}
                        Original String: {searcher[0].get_text()}
                        Price string: {price_string}
                        Box #{b}'''
                        raise ValueError(error_message) from err

    if test_all:
        print('prices:', len(price), price)

    elif test_len:
        print('prices:', len(price))

    return price