# Example 1
def baba_scrape(search):
    """Scrape Alibaba search results for *search*.

    Returns a dict with keys: ``id``, ``ecommerce``, ``items`` (listing
    text), ``links``, ``prices`` (display strings), ``numbers`` (float
    prices) and ``average`` (mean price formatted "x.xx", or "0.00" when
    no listing matched).
    """
    print(f'searching baba for {search} ')
    split_search = search.split(" ")
    search = search.replace(" ", "+")
    base_url = 'https://alibaba.com'

    # search and url
    url = f"https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText={search}"

    # proxy_rotate (defined elsewhere in this module) fetches the page and
    # returns a BeautifulSoup object
    soup = proxy_rotate(url)

    baba = {}

    items = []
    prices = []
    numbers = []
    links = []

    # $X.XX .. $XXXX.XX, optionally with a thousands comma ($X,XXX.XX).
    # Raw string and escaped '.' fix the original non-raw, any-character
    # pattern; compiled once outside the loop.
    price_re = re.compile(r'\$(?:\d{1,4}|\d,\d{3})\.\d{2}')

    for item in soup.select('.img-switcher-parent'):
        text = item.get_text(strip=True).lower()
        # keep only listings that show a price and mention every search word
        if "$" in text and all(word in text for word in split_search):
            # first price-looking match in the listing text
            price = price_re.search(text)
            # BUG FIX: the original '$0.00' pattern used an unescaped '$'
            # (end-of-string anchor), so the zero-price filter never matched
            no_price = re.search(r'\$0\.00', text)
            if price is not None and no_price is None:
                # bs4 .find returns the first <a> tag carrying an href
                a = item.find('a', href=True)
                if a is None:
                    continue  # listing without a link: skip it
                links.append(base_url + a['href'])
                # .group() converts the match object into the matched string
                price = price.group()
                items.append(text)
                prices.append(price)
                # strip '$' and thousands separator before float conversion
                numbers.append(float(price.replace('$', '').replace(',', '')))

    # statistics.mean raises StatisticsError on an empty list; report 0.00
    average = "{:.2f}".format(statistics.mean(numbers)) if numbers else "0.00"

    baba['id'] = 2
    baba['ecommerce'] = 'baba'
    baba['items'] = items
    baba['links'] = links
    baba['prices'] = prices
    baba['numbers'] = numbers
    baba['average'] = average

    return baba
# Example 2
def ebay_scrape(search):
    """Scrape eBay Buy-It-Now search results for *search*.

    Returns a dict with keys: ``id``, ``ecommerce``, ``items`` (listing
    text), ``links``, ``prices`` (display strings), ``numbers`` (float
    prices) and ``average`` (mean price formatted "x.xx", or "0.00" when
    no listing matched).
    """
    print(f'searching ebay for {search} ')
    split_search = search.split(" ")
    search = search.replace(" ", "+")

    # search and url (LH_BIN=1 restricts to Buy It Now listings)
    # BUG FIX: the original URL had a stray '0' appended to the search term
    # (_nkw={search}0), corrupting every query
    url = f"https://www.ebay.com/sch/i.html?_from=R40&_nkw={search}&_sacat=0&rt=nc&LH_BIN=1"

    # proxy_rotate (defined elsewhere in this module) fetches the page and
    # returns a BeautifulSoup object
    soup = proxy_rotate(url)

    ebay = {}

    items = []
    prices = []
    numbers = []
    links = []

    # $X.XX .. $XXXX.XX, optionally with a thousands comma ($X,XXX.XX).
    # Raw string and escaped '.' fix the original non-raw, any-character
    # pattern; compiled once outside the loop.
    price_re = re.compile(r'\$(?:\d{1,4}|\d,\d{3})\.\d{2}')

    for item in soup.select('.s-item'):
        text = item.get_text(strip=True).lower()
        # keep only listings that show a price and mention every search word
        if "$" in text and all(word in text for word in split_search):
            # first price-looking match in the listing text
            price = price_re.search(text)
            # BUG FIX: the original '$0.00' pattern used an unescaped '$'
            # (end-of-string anchor), so the zero-price filter never matched
            no_price = re.search(r'\$0\.00', text)
            if price is not None and no_price is None:
                # bs4 .find returns the first <a> tag carrying an href;
                # eBay hrefs are already absolute, so no base URL is prefixed
                a = item.find('a', href=True)
                if a is None:
                    continue  # listing without a link: skip it
                links.append(a['href'])
                # .group() converts the match object into the matched string
                price = price.group()
                items.append(text)
                prices.append(price)
                # strip '$' and thousands separator before float conversion
                numbers.append(float(price.replace('$', '').replace(',', '')))

    # statistics.mean raises StatisticsError on an empty list; report 0.00
    average = "{:.2f}".format(statistics.mean(numbers)) if numbers else "0.00"

    ebay['id'] = 3
    ebay['ecommerce'] = 'ebay'
    ebay['items'] = items
    ebay['links'] = links
    ebay['prices'] = prices
    ebay['numbers'] = numbers
    ebay['average'] = average

    return ebay
# Example 3
def amazon_scrape(search):
    """Scrape Amazon search results for *search*.

    Returns a dict with keys: ``id``, ``ecommerce``, ``items`` (listing
    text), ``links``, ``prices`` (display strings), ``numbers`` (float
    prices) and ``average`` (mean price formatted "x.xx", or "0.00" when
    no listing matched).
    """
    print(f'searching amazon for {search} ')
    search = search.replace(" ", "+")
    base_url = 'https://amazon.com'

    # search and url
    url = f"https://www.amazon.com/s?k={search}&ref=nb_sb_noss_2"

    # proxy_rotate (defined elsewhere in this module) fetches the page and
    # returns a BeautifulSoup object
    soup = proxy_rotate(url)

    amazon = {}

    items = []
    prices = []
    numbers = []
    links = []

    # $X.XX .. $XXXX.XX, optionally with a thousands comma ($X,XXX.XX) for
    # consistency with the sibling scrapers (the comma is stripped below
    # anyway). Raw string and escaped '.' fix the original non-raw,
    # any-character pattern; compiled once outside the loop.
    price_re = re.compile(r'\$(?:\d{1,4}|\d,\d{3})\.\d{2}')

    for item in soup.findAll('div', {'class': 'sg-col-inner'}):
        text = item.get_text(strip=True).lower()
        if "$" in text:
            # first price-looking match in the listing text
            price = price_re.search(text)
            # BUG FIX: the original '$0.00' pattern used an unescaped '$'
            # (end-of-string anchor), so the zero-price filter never matched
            no_price = re.search(r'\$0\.00', text)
            if price is not None and no_price is None:
                # bs4 .find returns the first <a> tag carrying an href
                a = item.find('a', href=True)
                if a is None:
                    continue  # listing without a link: skip it
                links.append(base_url + a['href'])
                # .group() converts the match object into the matched string
                price = price.group()
                items.append(text)
                prices.append(price)
                # strip '$' and thousands separator before float conversion
                numbers.append(float(price.replace('$', '').replace(',', '')))

    # statistics.mean raises StatisticsError on an empty list; report 0.00
    average = "{:.2f}".format(statistics.mean(numbers)) if numbers else "0.00"

    amazon['id'] = 1
    amazon['ecommerce'] = 'amazon'
    amazon['items'] = items
    amazon['links'] = links
    amazon['prices'] = prices
    amazon['numbers'] = numbers
    amazon['average'] = average

    return amazon
# Example 4
def walmart_scrape(search):
    """Scrape Walmart search results for *search*.

    Walmart embeds its search results as JSON inside a
    ``<script id="searchContent">`` tag, so this parses JSON instead of
    scraping HTML tiles. Returns a dict with keys: ``id``, ``ecommerce``,
    ``items`` (titles), ``links``, ``prices`` (display strings),
    ``numbers`` (numeric prices) and ``average`` (mean price formatted
    "x.xx", or "0.00" when nothing was parsed).
    """
    print(f'searching walmart for {search} ')
    search = search.replace(" ", "%20")

    # search and url
    url = f"https://www.walmart.com/search/?query={search}"

    # proxy_rotate (defined elsewhere in this module) fetches the page and
    # returns a BeautifulSoup object
    soup = proxy_rotate(url)

    walmart = {}

    items = []
    prices = []
    numbers = []
    links = []

    # The results payload lives in the script tag with id="searchContent";
    # soup.find returns None when the tag is missing, which the except
    # below handles (AttributeError).
    data = soup.find("script", {"id": "searchContent"})
    try:
        json_data = json.loads(data.contents[0].string)
        for item in json_data['searchContent']['preso']['items']:
            items.append(item['title'])
            links.append(item['productPageUrl'])
            prices.append('$' + str(item['primaryOffer']['offerPrice']))
            numbers.append(item['primaryOffer']['offerPrice'])
    # narrowed from a bare except: missing tag (AttributeError), empty tag
    # (IndexError), malformed JSON (ValueError covers JSONDecodeError),
    # or an unexpected payload shape (KeyError/TypeError)
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        print('not a good web page... breaking')

    # statistics.mean raises StatisticsError on an empty list; report 0.00
    average = "{:.2f}".format(statistics.mean(numbers)) if numbers else "0.00"

    walmart['id'] = 5
    walmart['ecommerce'] = 'walmart'
    walmart['items'] = items
    walmart['links'] = links
    walmart['prices'] = prices
    walmart['numbers'] = numbers
    walmart['average'] = average

    return walmart