import json
import re
import statistics

# proxy_rotate(url) is defined elsewhere in this project; it fetches a URL
# through a rotating proxy and returns a BeautifulSoup object.

# One compiled pattern replaces the original five-way alternation. It matches
# prices from $X.XX up to $9,999.99, with or without a thousands comma, and
# escapes the '.' that the original patterns left unescaped.
PRICE_RE = re.compile(r"\$\d{1,4}(?:,\d{3})?\.\d{2}")
# The original '$0.00' pattern never matched (an unescaped '$' anchors the end
# of the string); this is the literal it was meant to find.
NO_PRICE_RE = re.compile(r"\$0\.00")


def baba_scrape(search):
    print(f'searching baba for {search}')
    # lowercase the search words so they can match the lowercased listing text
    split_search = search.lower().split(" ")
    search = search.replace(" ", "+")
    base_url = 'https://alibaba.com'
    # search URL
    url = f"https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText={search}"
    # get soup from the rotating-proxy fetcher
    soup = proxy_rotate(url)

    baba = {}
    items = []
    prices = []
    numbers = []
    links = []
    for item in soup.select('.img-switcher-parent'):
        text = item.get_text(strip=True).lower()
        # keep only listings that show a price and contain every search word
        if "$" in text and all(word in text for word in split_search):
            price = PRICE_RE.search(text)  # first price in the listing text
            no_price = NO_PRICE_RE.search(text)
            if price is not None and no_price is None:
                # bs4 .find returns the first matching <a> tag
                a = item.find('a', href=True)
                links.append(base_url + a['href'])
                # .group() converts the match object into a string
                price = price.group()
                items.append(text)
                prices.append(price)
                numbers.append(float(price.replace('$', '').replace(',', '')))

    # statistics.mean raises on an empty list, so guard against zero matches
    average = "{:.2f}".format(statistics.mean(numbers)) if numbers else "0.00"
    baba['id'] = 2
    baba['ecommerce'] = 'baba'
    baba['items'] = items
    baba['links'] = links
    baba['prices'] = prices
    baba['numbers'] = numbers
    baba['average'] = average
    return baba
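
# NOTE: proxy_rotate is defined elsewhere in this project. As a rough sketch of
# the contract the scrapers in this file rely on, a minimal no-proxy stand-in
# could look like this; simple_fetch and its headers are hypothetical, not part
# of the original code:
import requests
from bs4 import BeautifulSoup

def simple_fetch(url):
    """Hypothetical stand-in for proxy_rotate: fetch directly, no proxy pool."""
    # assumed header; many storefronts block requests with no User-Agent
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    # the scrapers expect a parsed BeautifulSoup object back
    return BeautifulSoup(response.text, 'html.parser')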
def ebay_scrape(search):
    print(f'searching ebay for {search}')
    split_search = search.lower().split(" ")
    search = search.replace(" ", "+")
    # search URL, restricted to Buy It Now listings; the original had a stray
    # '0' appended to the _nkw parameter
    url = f"https://www.ebay.com/sch/i.html?_from=R40&_nkw={search}&_sacat=0&rt=nc&LH_BIN=1"
    # get soup from the rotating-proxy fetcher
    soup = proxy_rotate(url)

    ebay = {}
    items = []
    prices = []
    numbers = []
    links = []
    for item in soup.select('.s-item'):
        text = item.get_text(strip=True).lower()
        # keep only listings that show a price and contain every search word
        if "$" in text and all(word in text for word in split_search):
            price = PRICE_RE.search(text)  # first price in the listing text
            no_price = NO_PRICE_RE.search(text)
            if price is not None and no_price is None:
                # eBay hrefs are already absolute, so no base URL prefix
                a = item.find('a', href=True)
                links.append(a['href'])
                # .group() converts the match object into a string
                price = price.group()
                items.append(text)
                prices.append(price)
                numbers.append(float(price.replace('$', '').replace(',', '')))

    # statistics.mean raises on an empty list, so guard against zero matches
    average = "{:.2f}".format(statistics.mean(numbers)) if numbers else "0.00"
    ebay['id'] = 3
    ebay['ecommerce'] = 'ebay'
    ebay['items'] = items
    ebay['links'] = links
    ebay['prices'] = prices
    ebay['numbers'] = numbers
    ebay['average'] = average
    return ebay
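
# A quick, illustrative sanity check of the consolidated price pattern used by
# the scrapers; the sample strings below are made up:
def _demo_price_regex():
    for sample in ['ssd drive $59.99 free shipping', 'bulk lot $1,299.00', 'now $0.00']:
        match = PRICE_RE.search(sample)
        # note: PRICE_RE does match '$0.00'; the scrapers reject that case
        # separately with NO_PRICE_RE before recording a price
        print(sample, '->', match.group() if match else None)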
def amazon_scrape(search):
    print(f'searching amazon for {search}')
    split_search = search.lower().split(" ")
    search = search.replace(" ", "+")
    base_url = 'https://amazon.com'
    # search URL
    url = f"https://www.amazon.com/s?k={search}&ref=nb_sb_noss_2"
    # get soup from the rotating-proxy fetcher
    soup = proxy_rotate(url)

    amazon = {}
    items = []
    prices = []
    numbers = []
    links = []
    for item in soup.find_all('div', {'class': 'sg-col-inner'}):
        text = item.get_text(strip=True).lower()
        # apply the same every-search-word filter as the other scrapers
        # (the original computed split_search here but never used it)
        if "$" in text and all(word in text for word in split_search):
            price = PRICE_RE.search(text)  # first price in the listing text
            no_price = NO_PRICE_RE.search(text)
            if price is not None and no_price is None:
                # Amazon hrefs are site-relative, so prefix the base URL
                a = item.find('a', href=True)
                links.append(base_url + a['href'])
                # .group() converts the match object into a string
                price = price.group()
                items.append(text)
                prices.append(price)
                numbers.append(float(price.replace('$', '').replace(',', '')))

    # statistics.mean raises on an empty list, so guard against zero matches
    average = "{:.2f}".format(statistics.mean(numbers)) if numbers else "0.00"
    amazon['id'] = 1
    amazon['ecommerce'] = 'amazon'
    amazon['items'] = items
    amazon['links'] = links
    amazon['prices'] = prices
    amazon['numbers'] = numbers
    amazon['average'] = average
    return amazon
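
# baba_scrape, ebay_scrape, and amazon_scrape repeat the same extraction loop.
# A possible refactor (a sketch only, not applied above) would factor the
# shared logic into one helper; the name and signature are hypothetical:
def scrape_listings(soup, selector, split_search, base_url=''):
    """Extract (items, prices, numbers, links) from listing nodes matching selector."""
    items, prices, numbers, links = [], [], [], []
    for node in soup.select(selector):
        text = node.get_text(strip=True).lower()
        if "$" in text and all(word in text for word in split_search):
            price = PRICE_RE.search(text)
            if price is not None and NO_PRICE_RE.search(text) is None:
                # pass base_url='' for sites whose hrefs are already absolute
                a = node.find('a', href=True)
                links.append(base_url + a['href'])
                items.append(text)
                prices.append(price.group())
                numbers.append(float(price.group().replace('$', '').replace(',', '')))
    return items, prices, numbers, links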
def walmart_scrape(search):
    print(f'searching walmart for {search}')
    search = search.replace(" ", "%20")
    base_url = 'https://walmart.com'
    # search URL
    url = f"https://www.walmart.com/search/?query={search}"
    # get soup from the rotating-proxy fetcher
    soup = proxy_rotate(url)

    walmart = {}
    items = []
    prices = []
    numbers = []
    links = []
    # Walmart ships its results as application/json inside
    # <script id="searchContent">, so parse that instead of scraping the
    # product-tile HTML
    data = soup.find("script", {"id": "searchContent"})
    try:
        json_data = json.loads(data.string)
        for item in json_data['searchContent']['preso']['items']:
            items.append(item['title'])
            # productPageUrl is site-relative, so prefix the base URL
            links.append(base_url + item['productPageUrl'])
            prices.append('$' + str(item['primaryOffer']['offerPrice']))
            numbers.append(item['primaryOffer']['offerPrice'])
    except (AttributeError, KeyError, TypeError, json.JSONDecodeError):
        print('not a good web page... breaking')

    # statistics.mean raises on an empty list, so guard against zero matches
    average = "{:.2f}".format(statistics.mean(numbers)) if numbers else "0.00"
    walmart['id'] = 5
    walmart['ecommerce'] = 'walmart'
    walmart['items'] = items
    walmart['links'] = links
    walmart['prices'] = prices
    walmart['numbers'] = numbers
    walmart['average'] = average
    return walmart
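
# Example of how the four scrapers are meant to be used together; the query
# and the aggregation below are illustrative only and assume proxy_rotate is
# available:
if __name__ == '__main__':
    query = 'usb c cable'  # hypothetical search term
    results = [amazon_scrape(query), baba_scrape(query),
               ebay_scrape(query), walmart_scrape(query)]
    # sort stores by average price, cheapest first
    for store in sorted(results, key=lambda r: float(r['average'])):
        print(f"{store['ecommerce']}: {len(store['items'])} items, "
              f"average ${store['average']}")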