def WalmartScrape(product):
    """Scrape Walmart search results for *product*.

    Stage 1 writes the raw anchor tags returned by the search scrape to
    'WMurls', stage 2 rewrites that file keeping only product links
    converted to full URLs, and stage 3 visits each product page and
    appends '@'-delimited title/price/rating/url records to 'WMproducts'.

    Relies on the project helpers scrapePrep() and pageURL(); performs
    network and file I/O and returns nothing.
    """
    WalmartURL = "https://www.walmart.com/search/?query="
    WalmartID = "product-title-link line-clamp line-clamp-2 truncate-title"
    # BUG FIX: the original called `file.close` without parentheses, which
    # never actually closed the handle; `with` guarantees flush + close.
    with open('WMurls', 'w') as file:
        results = scrapePrep(WalmartURL, WalmartID, product)
        file.write(str(results))
    with open('WMurls', 'r') as file:
        lines = file.readlines()
    # Rewrite the file, keeping only product-title links rewritten into
    # absolute walmart.com URLs.
    with open('WMurls', 'w') as file:
        for line in lines:
            line = line.strip()
            if line.find(
                    '<a class="product-title-link line-clamp line-clamp-2 truncate-title" data-type="itemTitles" href="'
            ) != -1:
                line = line.replace(
                    '</a>, <a class="product-title-link line-clamp line-clamp-2 truncate-title" data-type="itemTitles" href="',
                    'https://www.walmart.com/')
                line = line.replace(
                    '[<a class="product-title-link line-clamp line-clamp-2 truncate-title" data-type="itemTitles" href="',
                    'https://www.walmart.com/')
                line = line.replace('" lang="en" tabindex="-1">', '')
                file.write(line + "\n")
    with open('WMurls', 'r') as file:
        lines = file.readlines()
    # Visit each product URL and record title/price/rating/url.
    with open('WMproducts', 'w') as file:
        for line in lines:
            line = line.strip()
            productSoup = pageURL(line)
            title = productSoup.find(
                class_="prod-ProductTitle font-normal").get_text()
            # Walmart splits the price into dollars ("characteristic") and
            # cents ("mantissa"); join them with a decimal point.
            price = productSoup.find(
                class_="price-characteristic").get_text() + '.' + productSoup.find(
                    class_="price-mantissa").get_text()
            price = price.replace('\n', '')
            price = price.replace(' ', '')
            price = price.replace(',', '')
            rating = productSoup.find(class_="seo-avg-rating").get_text()
            site = line
            # '@' is the field delimiter expected by downstream readers.
            file.write(title.strip() + '@')
            file.write(price.strip() + '@')
            file.write(rating.strip() + '@')
            file.write(site + '\n')
def BHscrape(product):
    """Scrape B&H Photo search results for *product*.

    Stage 1 writes the raw anchor tags from the search scrape to 'BHurls',
    stage 2 rewrites that file keeping only product links converted to
    absolute URLs, and stage 3 visits each product page and appends
    '@'-delimited title/price records to 'BHproducts'.

    Relies on the project helpers scrapePrep() and pageURL(); performs
    network and file I/O and returns nothing.
    """
    BHurl = 'https://www.bhphotovideo.com/c/search?Ntt='
    BHID = 'more_3AHFX0SPjGtK8ii-kIXIau'
    # BUG FIX: the original never closed the final 'BHproducts' write
    # handle, risking lost buffered output; `with` blocks close every
    # handle deterministically.
    with open('BHurls', 'w') as file:
        results = scrapePrep(BHurl, BHID, product)
        file.write(str(results))
    with open('BHurls', 'r') as file:
        lines = file.readlines()
    # Rewrite the file, keeping only product links rewritten into
    # absolute bhphotovideo.com URLs.
    with open('BHurls', 'w') as file:
        for line in lines:
            line = line.strip()
            if line.find('<a class="more_3AHFX0SPjGtK8ii-kIXIau"') != -1:
                line = line.replace(
                    '</a>, <a class="more_3AHFX0SPjGtK8ii-kIXIau" data-selenium="miniProductPageSellingPointsDetailsLink" href="',
                    'https://www.bhphotovideo.com')
                line = line.replace(
                    '[<a class="more_3AHFX0SPjGtK8ii-kIXIau" data-selenium="miniProductPageSellingPointsDetailsLink" href="',
                    'https://www.bhphotovideo.com')
                line = line.replace('">', '')
                file.write(line + '\n')
    with open('BHurls', 'r') as file:
        lines = file.readlines()
    # Visit each product URL and record title/price.
    with open('BHproducts', 'w') as file:
        for line in lines:
            line = line.strip()
            productSoup = pageURL(line)
            title = productSoup.find(
                class_="title_3bJZzlB3PKkE_8ajs9mroe").get_text()
            price = productSoup.find(
                class_="price_1DPoToKrLP8uWvruGqgtaY").get_text()
            price = price.replace('$', '')
            price = price.replace(',', '')
            # '@' is the field delimiter expected by downstream readers.
            # NOTE(review): unlike the other scrapers, no rating is
            # recorded here (the rating lookup was commented out upstream).
            file.write(title.strip() + '@')
            file.write(price.strip() + '\n')
def BestBuyScrape(product):
    """Scrape Best Buy search results for *product*.

    Stage 1 writes the raw anchor tags from the search scrape to 'BBurls',
    stage 2 rewrites that file keeping only product image-links converted
    to absolute URLs, and stage 3 visits each product page and appends
    '@'-delimited title/price/rating/url records to 'BBproducts'.

    Relies on the project helpers scrapePrep() and pageURL(); performs
    network and file I/O and returns nothing.
    """
    BestBuyURL = "https://www.bestbuy.com/site/searchpage.jsp?st="
    BestBuyID = "image-link"
    # BUG FIX: the original called `file.close` without parentheses twice
    # and never closed the final 'BBproducts' write handle; `with`
    # guarantees flush + close on every handle.
    with open('BBurls', 'w') as file:
        results = scrapePrep(BestBuyURL, BestBuyID, product)
        file.write(str(results))
    with open('BBurls', 'r') as file:
        lines = file.readlines()
    # Looking for specific product URLs to find the info needed.
    with open("BBurls", "w") as file:
        for line in lines:
            line = line.strip()
            if line.find('</a>, <a class="image-link" href=') != -1:
                line = line.replace('</a>, <a class="image-link" href="',
                                    'https://www.bestbuy.com')
                line = line.replace('">', '')
                file.write(line + "\n")
    with open('BBurls', 'r') as file:
        lines = file.readlines()
    # Visit each product URL and record title/price/rating/url.
    with open('BBproducts', 'w') as file:
        for line in lines:
            line = line.strip()
            productSoup = pageURL(line)
            title = productSoup.find(class_="heading-5 v-fw-regular").get_text()
            price = productSoup.find(
                'div',
                attrs={
                    'class': 'priceView-hero-price priceView-customer-price'
                }).get_text()
            # Strip the "Your price for this item is $" boilerplate; the
            # element repeats the price twice, so keep only the first half.
            price = price.replace(' ', '')
            price = price.replace('Yourpriceforthisitemis$', '')
            price = price.replace('\n', '')
            priceln = int((len(price)) / 2) + 1
            price = price[1:priceln]
            price = price.replace(',', '')
            rating = productSoup.find('p', class_="sr-only").get_text()
            # Unreviewed products get a sentinel rating of "0".
            if rating.find('Be the first') != -1:
                rating = "0"
            if rating.find('Not yet reviewed') != -1:
                rating = '0'
            rating = rating.replace('Rating, ', '')
            rating = rating.replace('out', ' ')
            rating = rating.strip()
            # Keep just the numeric prefix, e.g. "4.5".
            rating = rating[0:3]
            site = line
            # '@' is the field delimiter expected by downstream readers.
            file.write(title.strip() + '@')
            file.write(price.strip() + '@')
            file.write(rating.strip() + '@')
            file.write(site + '\n')
import requests
from bs4 import BeautifulSoup
from functions import pageURL, itemInput, URLinput, pageInfo

# Amazon search scraper (script-style): builds a search URL from user input,
# fetches the results page, dumps the matching anchor tags to 'WebURLs',
# then filters for genuine product links.
amazonURL = "https://www.amazon.com/s?k="
amazonID = "a-link-normal a-text-normal"
file = open("WebURLs", "w")
# itemInput() presumably appends the user's search term to the base URL —
# TODO confirm against functions.py.
amazonURL = itemInput(amazonURL)
amazonPage = pageURL(amazonURL)
amazonSoup = URLinput(amazonPage)
print (amazonURL)
# pageInfo() extracts the elements matching amazonID from the parsed page.
results = pageInfo(amazonSoup, amazonID)
file.write(str(results))
file.close()
file = open("WebURLs", "r")
lines = file.readlines()
file.close()
file = open("WebURLs", "w")
#looking for correct product URLs
for line in lines:
    line = line.strip()
    # Keep only href-bearing lines that are not sponsored redirects
    # ("picassoRedirect") and not already absolute URLs.
    if line.find("href=") != -1:
        if line.find("/gp/slredirect/picassoRedirect.html") == -1:
            if line.find("http") == -1:
                line = line.replace(
                    '[<a class="a-link-normal a-text-normal" href="',
                    'https://www.amazon.com')
                line = line.replace(
                    '</a>, <a class="a-link-normal a-text-normal" href="',
                    'https://www.amazon.com')
                # NOTE(review): the cleaned line is never written back and the
                # 'WebURLs' handle opened above is never closed — the sibling
                # NewEgg script does `file.write(line + '\n')` and closes the
                # file at this point.  This chunk may be truncated; confirm
                # against the full source before relying on 'WebURLs' output.
import requests
from bs4 import BeautifulSoup
from functions import pageURL, itemInput, URLinput, pageInfo

# NewEgg search scraper (script-style): builds a search URL from user input,
# fetches the results page, dumps the matching anchor tags to 'NEurls',
# then rewrites the file keeping only cleaned product URLs.
NewEggURL = "https://www.newegg.com/p/pl?d="
NewEggID = "item-title"
NewEggURL = itemInput(NewEggURL)
NewEggPage = pageURL(NewEggURL)
NewEggSoup = URLinput(NewEggPage)
print(NewEggURL)
results = pageInfo(NewEggSoup, NewEggID)
# BUG FIX: the original wrote `file.close` (no parentheses) twice, so the
# write handles were never closed/flushed; `with` guarantees both.
with open('NEurls', 'w') as file:
    file.write(str(results))
with open('NEurls', 'r') as file:
    lines = file.readlines()
# Rewrite the file, keeping only item-title links stripped of markup.
with open('NEurls', 'w') as file:
    for line in lines:
        line = line.strip()
        if line.find('<a class="item-title" href=') != -1:
            line = line.replace('</a>, <a class="item-title" href="', '')
            line = line.replace('[<a class="item-title" href="', '')
            line = line.replace('">', '')
            file.write(line + '\n')