def scrapper(urlToRequest, firstPage, lastPage, fileName, openMode, newlineDelimiter):
    global pagesScrapped
    # Open a CSV file to write data into
    with open(fileName, openMode, newline=newlineDelimiter) as csv_file:
        writer = csv.writer(csv_file)
        # For every page from the first to the last
        for page in range(firstPage, lastPage):
            uClient = urlReq(urlToRequest + "page/" + str(page))
            page_html = uClient.read()
            uClient.close()
            # Initialize the page soup to scrape data
            page_soup = soup(page_html, "html.parser")
            containers = page_soup.findAll("h2", {"class": "title"})
            # Show progress, then open each game's own page to get RAM and setup size
            for container in containers:
                pagesScrapped += 1
                print("\rScraped pages: " + str(pagesScrapped), end='')
                uClientGamePage = urlReq(
                    urlToRequest +
                    "-".join(container.a.text.split(" ")).lower().replace(
                        "'", "").replace('"', '').replace("’", ""))
                game_page_html = uClientGamePage.read()
                uClientGamePage.close()
                # All data that is to be scraped
                try:
                    game_page_soup = soup(game_page_html, "html.parser")
                    ramReq = game_page_soup.find(text=re.compile("^RAM"))
                    setupSize = game_page_soup.find(
                        text=re.compile("^Setup Size"))
                    cpuReq = game_page_soup.find(text=re.compile("^CPU"))
                    OSReq = game_page_soup.find(
                        text=re.compile("^Operating System"))
                    genres = game_page_soup.findAll("li",
                                                    {"class": "active-parent"})
                    genre_string = ""
                    for genre in genres:
                        genre_string += genre.a.text + " "
                    if setupSize is None:
                        setupSize = "Setup Size: NA"
                    if ramReq is None:
                        ramReq = "RAM: NA"
                    writer.writerow([
                        " ".join(container.a.text.split(" ")[:-2]),
                        ramReq.split(":")[1],
                        setupSize.split(":")[1],
                        cpuReq.split(":")[1],
                        OSReq.split(":")[1], genre_string
                    ])
                except AttributeError:
                    print("\nAttribute error in game: " +
                          " ".join(container.a.text.split(" ")[:-2]))
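# A minimal usage sketch (the base URL, page range, and CSV name below are
# illustrative, not values from the original project). scrapper() expects
# these module-level names to exist:
#   import csv, re
#   from urllib.request import urlopen as urlReq
#   from bs4 import BeautifulSoup as soup
#   pagesScrapped = 0
# scrapper("https://example-games-site.com/", 1, 11, "games.csv", "w", "")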
def tellWeather():
    try:
        speak(weatherDict[random.randint(1, len(weatherDict))])
        uClient = urlReq(weather_url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        tempValue = page_soup.find("div", {"class": "today_nowcard-temp"})
        currentTemperature = tempValue.span.text
        sideTable = page_soup.find(
            "div", {
                "class": "today_nowcard-sidecar component panel"
            }).table.tbody.tr.td.span
        wind = sideTable.text
        windDirection = wind.split()[0]
        windSpeed = wind.split()[1] + wind.split()[2]
        speak("It's " + currentTemperature +
              "C currently with a wind speed of " + windSpeed +
              " in the direction " + windDirectionTable[windDirection])
    except urllib.error.HTTPError:
        speak("It seems the weather server is ignoring me. Can you try this later?")
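# tellWeather() assumes several module-level names: speak(), weatherDict,
# weather_url, and windDirectionTable. A minimal sketch of the direction table
# and a speak() stand-in (both are assumptions, not the project's real code):
windDirectionTable = {
    "N": "north", "NE": "northeast", "E": "east", "SE": "southeast",
    "S": "south", "SW": "southwest", "W": "west", "NW": "northwest",
}

def speak(message):
    # Stand-in for the project's text-to-speech helper
    print(message)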
def allrecipe_search(ingredients):
    recipeDict = {}
    # Search for recipes on allrecipes.com
    searchUrl = ("https://www.allrecipes.com/search/results/?wt=" +
                 ingredients + "&sort=re")
    recipeUrlList = urlReq(searchUrl)
    htmlRaw = recipeUrlList.read()
    recipeUrlList.close()
    soup = BeautifulSoup(htmlRaw, "html.parser")
    # Get a list of recipes from the main search result page
    recipeBook = []
    recipeUrlList = []
    recipeBlockContainer = soup.findAll("article",
                                        {"class": "fixed-recipe-card"})
    for recipeBlock in recipeBlockContainer:
        recipeUrl = recipeBlock.div.a['href']
        recipeUrlList.append(recipeUrl)
    # Fetch each recipe page concurrently
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = [
            executor.submit(get_recipe_list, recipeUrl)
            for recipeUrl in recipeUrlList
        ]
        for f in concurrent.futures.as_completed(results):
            recipeBook.append(f.result())
    recipeDict['allrecipes'] = recipeBook
    return recipeDict
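# Usage sketch: allrecipe_search expects a plus-joined ingredient string and
# relies on get_recipe_list (defined elsewhere in this collection) plus the
# concurrent.futures, urlReq, and BeautifulSoup imports:
# results = allrecipe_search("chicken+rice")
# for card in results['allrecipes']:
#     print(card['title'], '->', card['URL'])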
def Wpvuldb_Api(data):
    # Will match in case data is a WordPress version
    regexVer = re.compile(r"^(\d{1,3}\.\d{0,3}\.{0,1}\d{0,3})")
    if regexVer.match(data):
        # The query is about WordPress core vulnerabilities
        base = "https://wpvulndb.com/api/v3/wordpresses/"
        url = base + data.replace(".", "")
    else:
        # The query is about plugin vulnerabilities (a matching regex could be
        # written, but if "data" isn't a WP version it is a plugin name/slug)
        base = "https://wpvulndb.com/api/v3/plugins/"
        url = base + data
    token = ""
    authHeader = {"Authorization": "Token token=" + token}
    apiReq = Request(url, headers=authHeader)
    siteResponse = urlReq(apiReq).read()
    jsonResponse = json.loads(siteResponse.decode("ISO-8859-1"))
    listVuln = jsonResponse[data]["vulnerabilities"]
    if not listVuln:
        print("\n\tThere are no known vulnerabilities yet")
    for vuln in listVuln:
        vulName = vuln.get("title")
        vulType = vuln.get("vuln_type")
        vulDate = vuln.get("created_at")
        vulRef = vuln["references"].get("url")
        vulCVE = vuln["references"].get("cve")
        print("\n\n\tVulnerability name: " + vulName + "\n\tType: " + vulType +
              "\n\tDate: " + vulDate + "\n\tReferences: " + str(vulRef) +
              "\n\tCVE: " + str(vulCVE))
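# Usage sketch: the wpvulndb API requires a personal token, so the empty
# token above must be filled in before calling. The inputs below are
# illustrative: a core-version query and a plugin-slug query.
# Wpvuldb_Api("4.9.4")    # queries /wordpresses/494
# Wpvuldb_Api("akismet")  # queries /plugins/akismet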
def getRecipeList(recipeUrl):
    # Open the connection, grab the webpage, and store the raw HTML
    req = Request(recipeUrl, headers={'User-Agent': 'Mozilla/5.0'})
    uClient = urlReq(req)
    htmlRaw = uClient.read()
    uClient.close()
    # Parse raw HTML
    soup = BeautifulSoup(htmlRaw, "html.parser")
    recipeCard = {}
    # Store URL into recipe card
    recipeCard['URL'] = recipeUrl
    # Find title and store into recipe card
    recipeTitle = soup.find("h1", {"class": "entry-title"}).text
    recipeCard['title'] = recipeTitle
    # Find image URL and store into recipe card
    imageContainer = soup.find("div", {"class": "featured-image"})
    imageUrl = imageContainer.img['src']
    recipeCard['image'] = imageUrl
    # Return single recipe
    return recipeCard
def getRecipeList(recipeUrl):
    # Open the connection, grab the webpage, and store the raw HTML
    uClient = urlReq(recipeUrl)
    htmlRaw = uClient.read()
    uClient.close()
    # Parse raw HTML
    soup = BeautifulSoup(htmlRaw, "html.parser")
    recipeCard = {}
    # Store URL into recipe card
    recipeCard['URL'] = recipeUrl
    # Find title and store into recipe card
    recipeTitle = soup.find("span", {
        "class": "o-AssetTitle__a-HeadlineText"
    }).text
    recipeCard['title'] = recipeTitle
    # Find star rating and store into recipe card
    recipe_stars = soup.find("span", {"class": "gig-rating-stars"})["title"]
    recipeCard['stars'] = recipe_stars
    # Find image URL and store into recipe card; fall back to a placeholder
    recipeHeader = soup.find("div", {"class": "recipe-lead"})
    imageContainer = recipeHeader.find(
        "img", {"class": "m-MediaBlock__a-Image a-Image"})
    if imageContainer:
        imageUrl = imageContainer['src']
        recipeCard['image'] = "http:" + imageUrl
    else:
        recipeCard['image'] = "https://i.imgur.com/bvzLAyR.jpg"
    # Return single recipe
    return recipeCard
def searchInNewegg(searchString, blockedWord, searchPageDepth, sortPreference,
                   currency):
    searchString = searchString.replace(' ', '+')
    results = []
    currentPage = 1
    datetime = date.datetime.now()
    while currentPage <= searchPageDepth:
        if currentPage != 0:
            if currentPage <= (searchPageDepth + 1):
                urlSite = ("https://www.newegg.com/p/pl?d=" + searchString +
                           "&Page=" + str(currentPage))
                webSite = urlReq(urlSite)
                html = webSite.read()
                webSite.close()
                page_soup = soup(html, 'html.parser')
        itemsWholeGrid = page_soup.find('div', {'class': 'items-view is-grid'})
        itemsWhole = itemsWholeGrid.findAll('div', {'class': 'item-container'})
        for item in itemsWhole:
            def itemAnalysis():
                text = item.find('div', {'class': 'item-info'})
                name = str(text.find('a', {'class': 'item-title'}).text)
                # Brittle: slices the raw tag string to pull out the price digits
                price = str(text.find('li', {'class': 'price-current'})
                            )[78:85].strip('</strong>').replace(',', '')
                try:
                    discount = str(
                        text.find('span', {
                            'class': 'price-save-percent'
                        }).text).strip('%')
                except AttributeError:
                    # discount not found
                    discount = 0
                if discount == 'None':
                    discount = 0
                itemNumber = str(len(results) + 1)
                link = str(text.find('a', {'class': 'item-title'})
                           ['href']).partition('?')[0].strip('https://')
                results.append((str(itemNumber), str(price), name, link,
                                str(discount), str(datetime), neweggDBPK))
                #print("item #" + itemNumber + ": " + name + " $" + price + ' OFF: ' + discount)
            bWordFound = 0
            for bWord in blockedWord:
                if bWord in str(item):
                    bWordFound += 1
            if bWordFound == 0:
                itemAnalysis()
        currentPage = currentPage + 1
    print('results in NewEgg :' + str(len(results)))
    if sortPreference == 'Increasing':
        return sortResults.sortIncreasing(results)
    if sortPreference == 'Decreasing':
        return sortResults.sortDecreasing(results)
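# sortResults is assumed to order the result tuples by their price field
# (index 1, stored as a string). A minimal sketch of what that module might
# contain:
def sortIncreasing(results):
    # Cheapest first; fall back to 0 when the price string is empty
    return sorted(results, key=lambda item: float(item[1] or 0))

def sortDecreasing(results):
    # Most expensive first
    return sorted(results, key=lambda item: float(item[1] or 0), reverse=True)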
def get_page_html(self):
    try:
        url_client = urlReq(self.url)
        page_html = url_client.read()
        url_client.close()
        return page_html
    except URLError as e:
        return 'URLError = ' + str(e)
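# get_page_html is an instance method; a minimal sketch of a class it could
# belong to (the class name and constructor are assumptions):
class PageScraper:
    def __init__(self, url):
        self.url = url  # the page that get_page_html will fetch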
def get_simply_recipe(recipeUrl):
    # Open the connection, grab the webpage, and store the raw HTML
    req = Request(recipeUrl, headers={'User-Agent': 'Mozilla/5.0'})
    uClient = urlReq(req)
    htmlRaw = uClient.read()
    uClient.close()
    # Parse raw HTML
    soup = BeautifulSoup(htmlRaw, "html.parser")
    recipeCard = {}
    # Store site name into recipe card
    recipeCard['siteName'] = "Simply Recipes"
    # Store URL into recipe card
    recipeCard['URL'] = recipeUrl
    # Find title and store into recipe card
    recipeTitle = soup.find("h1", {"class": "entry-title"}).text
    recipeCard['title'] = recipeTitle
    # Find image URL and store into recipe card
    imageContainer = soup.find("div", {"class": "featured-image"})
    imageUrl = imageContainer.img['src']
    recipeCard['image'] = imageUrl
    # Find metadata of the recipe (prep, cook, yield)
    metadataAry = []
    recipePrep = soup.find("li", {"class": "recipe-prep"}).text.strip()
    recipeCook = soup.find("li", {"class": "recipe-cook"}).text.strip()
    recipeYield = soup.find("li", {"class": "recipe-yield"}).text.strip()
    metadataAry.append(recipePrep)
    metadataAry.append(recipeCook)
    metadataAry.append(recipeYield)
    recipeCard['metadata'] = metadataAry
    # Find ingredients
    ingredientListAry = []
    ingredientList = soup.findAll("li", {"class": "ingredient"})
    for ingredientItem in ingredientList:
        ingredient = ingredientItem.text
        ingredientListAry.append(ingredient)
    recipeCard['ingredients'] = ingredientListAry
    # Find instructions
    instructionsAry = []
    instructionsContainer = soup.find("div", {"id": "sr-recipe-method"})
    instructions = instructionsContainer.findAll("p")
    for instructionItem in instructions:
        instruction = instructionItem.text.strip()
        instructionsAry.append(instruction)
    recipeCard['instructions'] = instructionsAry
    # Return single recipe
    return recipeCard
def getLyricsForSong(songName):
    try:
        uClient = urlReq(lyrics_url + songName + ".html")
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        containers = page_soup.findAll("p", {"class": "verse"})
        speak("Here are the lyrics for that song\n")
        for container in containers:
            print(container.text)
    except urllib.error.HTTPError:
        speak(noLyricsMessageDict[random.randint(1, len(noLyricsMessageDict))])
def getPrice():
    # Setting URL variables
    page_url = 'https://finance.yahoo.com/quote/FB?p=FB'
    urlClient = urlReq(page_url)
    page_html = urlClient.read()
    urlClient.close()
    # Parsing HTML
    page_soup = soup(page_html, "html.parser")
    # Grabbing current price
    curr_price = page_soup.find('div', {
        'class': 'My(6px) Pos(r) smartphone_Mt(6px)'
    }).find('span').text
    return curr_price
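# Usage sketch: the Yahoo Finance class string above is auto-generated markup
# and changes often, so this selector is brittle and may need updating.
# print("FB is trading at $" + getPrice())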
def getUrls(ingredients, pageNum):
    # Search a few pages from the main search result
    searchUrl = ("https://www.foodnetwork.com/search/" + ingredients +
                 "-/p/" + str(pageNum) + "/rating")
    recipeUrlList = urlReq(searchUrl)
    htmlRaw = recipeUrlList.read()
    recipeUrlList.close()
    soup = BeautifulSoup(htmlRaw, "html.parser")
    # Find all recipe URLs and return the URL list, skipping video pages
    recipeUrlList = []
    recipeBlockContainer = soup.findAll("h3",
                                        {"class": "m-MediaBlock__a-Headline"})
    for recipeBlock in recipeBlockContainer:
        recipeUrl = "https:" + recipeBlock.a['href']
        if "videos" not in recipeUrl:
            recipeUrlList.append(recipeUrl)
    return recipeUrlList
def getUrls(item, pageNum):
    # Search a few pages from the main search result
    searchUrl = ("https://www.simplyrecipes.com/recipes/main-ingredient/" +
                 item + "/page/" + str(pageNum) + "/")
    req = Request(searchUrl, headers={'User-Agent': 'Mozilla/5.0'})
    recipeUrlList = urlReq(req)
    htmlRaw = recipeUrlList.read()
    recipeUrlList.close()
    soup = BeautifulSoup(htmlRaw, "html.parser")
    # Find all recipe URLs and return the URL list
    recipeUrlList = []
    recipeBlockContainer = soup.findAll("h2", {"class": "grd-title-link"})
    for recipeBlock in recipeBlockContainer:
        recipeUrl = recipeBlock.a['href']
        recipeUrlList.append(recipeUrl)
    return recipeUrlList
def Plugins_Enum():
    try:
        pwd = "./wp-content/plugins"
        plgFolders = [
            subDir for subDir in Path(pwd).iterdir() if subDir.is_dir()
        ]
        rdmFile = "/README.txt"
        for slug in plgFolders:
            base = "https://api.wordpress.org/plugins/info/1.1/?action=query_plugins&request[search]="
            url = base + slug.name
            siteResponse = urlReq(url).read()
            jsonResponse = json.loads(siteResponse.decode("ISO-8859-1"))
            firstMatch = jsonResponse["plugins"][0]
            plgName = firstMatch.get("name")
            plgVersion = firstMatch.get("version")
            plgAuthor = firstMatch.get("author_profile")
            plgSite = firstMatch.get("homepage")
            plgWPReq = firstMatch.get("requires")
            if Path(str(slug) + rdmFile).is_file():
                instVersion = verParser(rdmFile, slug)
            elif Path(str(slug) + rdmFile.lower()).is_file():
                instVersion = verParser(rdmFile.lower(), slug)
            else:
                instVersion = "NA"  # no README found to parse a version from
            # Show info from the WordPress API for each plugin parsed
            print("\n\nPlugin name: " + plgName + "\x1b[1;31m" +
                  "\nLatest version: " + plgVersion +
                  "\nInstalled version: " + instVersion + "\x1b[0;m" +
                  "\nAuthor: " + plgAuthor + "\nSite: " + plgSite +
                  "\nWordPress version required: " + plgWPReq)
            # Show the vulnerabilities stored in wpvulndb for each plugin parsed
            Wpvuldb_Api(slug.name)
    except FileNotFoundError:
        print("The directory \"wp-content\" doesn't exist, therefore plugins "
              "cannot be enumerated")
        Handler()
def get_recipe_list(recipeUrl):
    # Open the connection, grab the webpage, and store the raw HTML
    uClient = urlReq(recipeUrl)
    htmlRaw = uClient.read()
    uClient.close()
    # Parse raw HTML
    soup = BeautifulSoup(htmlRaw, "html.parser")
    recipeCard = {}
    # Store URL into recipe card
    recipeCard['URL'] = recipeUrl
    # Find title and store into recipe card
    recipeTitle = soup.find("h1").text
    recipeCard['title'] = recipeTitle
    # Find the star rating and store into recipe card; on the older layout,
    # grab the value in aria-label
    recipe_stars = soup.find('span', {'class': 'review-star-text'})
    if recipe_stars:
        recipeCard['stars'] = recipe_stars.text.strip()
    else:
        stars_span = soup.find('span', {'class': 'stars stars-5'})
        recipeCard['stars'] = stars_span['aria-label']
    # Find image URL and store into recipe card
    imageContainer = soup.find("div", {"class": "image-container"})
    if imageContainer:
        imageUrl = imageContainer.div['data-src']
    else:
        imageContainer = soup.find("img", {"class": "rec-photo"})
        imageUrl = imageContainer['src']
    recipeCard['image'] = imageUrl
    # Return single recipe as dictionary
    return recipeCard
import bs4
from urllib.request import urlopen as urlReq
from bs4 import BeautifulSoup as soup

evga_url = 'https://www.evga.com/products/productlist.aspx?type=0'

# GET request for the webpage; store the raw HTML
url_client = urlReq(evga_url)
raw_html = url_client.read()
url_client.close()

# use soup to perform the parsing
page_soup = soup(raw_html, "html.parser")
all_product_containers = page_soup.find_all("div", attrs={"class": "list-item"})

filename = "listings.csv"
f = open(filename, "w")
headers = "name, original price, discount, final price, base clock, boost clock, VRAM, bandwidth, link\n"
f.write(headers)

for product in all_product_containers:
    # print(product.prettify())
    details = []
    img = product.div.a.img["src"]
    link = product.find("div", {"class": "pl-list-image"}).contents[1]["href"]
    name = product.find("div", {"class": "pl-list-image"}).contents[1]["title"]
    details_ul = product.find("div", {"class": "pl-list-info"}).ul.contents
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as urlReq

page_url = "https://dmoz-odp.org/"
urlClient = urlReq(page_url)
page_soup = soup(urlClient.read(), "html.parser")
urlClient.close()

links = page_soup.findAll('a')

out_filename = "dmoz_links.csv"
headers = "link\n"
f = open(out_filename, "w")
f.write(headers)

count = 0
for link in links:
    if (count < 49):
        link_store = link.get('href')
        temp_link = str(link_store)
        if (temp_link.find('https') != -1):
            print("Link :" + str(link_store))
            f.write(str(link_store) + "\n")
        else:
            url = "https://dmoz-odp.org"
import bs4
from urllib.request import urlopen as urlReq
from bs4 import BeautifulSoup as soup

username = input("Enter Username :")
myurl = "https://github.com/" + username + "?tab=repositories"

uClient = urlReq(myurl)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("div",
                               {"class": "col-10 col-lg-9 d-inline-block"})
repo_count = page_soup.findAll("span", {"class": "Counter"})
print("Total number of repositories: ", repo_count[0].text)

filename = "Repo_list.csv"
f = open(filename, "w")
headers = "Repository_name, Language, Last_update\n"
f.write(headers)

for container in containers:
    repo = container.findAll("h3", {"class": "wb-break-all"})
    repo_name = ((repo[0].text).replace(" ", "")).replace("\n", "")
    lang = container.findAll("span", {"itemprop": "programmingLanguage"})
def get_all_recipe(recipeUrl):
    # Open the connection, grab the webpage, and store the raw HTML
    uClient = urlReq(recipeUrl)
    htmlRaw = uClient.read()
    uClient.close()
    # Parse raw HTML
    soup = BeautifulSoup(htmlRaw, "html.parser")
    recipeCard = {}
    # Store URL into recipe card
    recipeCard['URL'] = recipeUrl
    # Find title and store into recipe card
    recipeTitle = soup.find("h1").text
    recipeCard['title'] = recipeTitle
    # Find image URL and store into recipe card (new layout first, then old)
    imageContainer = soup.find("div", {"class": "image-container"})
    if imageContainer:
        imageUrl = imageContainer.div['data-src']
    else:
        imageContainer = soup.find("img", {"class": "rec-photo"})
        imageUrl = imageContainer['src']
    recipeCard['image'] = imageUrl
    # Find metadata of the recipe
    metadataAry = []
    recipeMetadata = soup.findAll("div", {"class": "recipe-meta-item"})
    if recipeMetadata:
        for metadata in recipeMetadata:
            metadataHeader = metadata.find(
                "div", {"class": "recipe-meta-item-header"}).text.strip()
            metadataBody = metadata.find(
                "div", {"class": "recipe-meta-item-body"}).text.strip()
            metadataEntry = metadataHeader + ' ' + metadataBody
            metadataAry.append(metadataEntry)
    else:
        recipeMetadata = soup.findAll("li", {"aria-label": True})
        for metadata in recipeMetadata:
            metadataEntry = metadata['aria-label']
            metadataAry.append(metadataEntry)
    recipeCard['metadata'] = metadataAry
    # Find ingredients
    ingredientListAry = []
    ingredientList = soup.findAll("li", {"class": "ingredients-item"})
    if ingredientList:
        for ingredientItem in ingredientList:
            ingredient = ingredientItem.find("span", {
                "class": "ingredients-item-name"
            }).text.strip()
            ingredientListAry.append(ingredient)
    else:
        ingredientList = soup.findAll("label", {"title": True})
        for ingredientItem in ingredientList:
            ingredient = ingredientItem['title']
            ingredientListAry.append(ingredient)
    recipeCard['ingredients'] = ingredientListAry
    # Find instructions
    instructionsAry = []
    instructionsContainer = soup.findAll(
        "li", {"class": "subcontainer instructions-section-item"})
    if instructionsContainer:
        for instructionItem in instructionsContainer:
            instruction = instructionItem.p.text
            instructionsAry.append(instruction)
    else:
        instructionsContainer = soup.findAll(
            "span", {"class": "recipe-directions__list--item"})
        for instructionItem in instructionsContainer:
            instruction = instructionItem.text.strip()
            instructionsAry.append(instruction)
    recipeCard['instructions'] = instructionsAry
    # Return single recipe as dictionary
    return recipeCard
# Importing files for web scraping
from urllib.request import urlopen as urlReq
from bs4 import BeautifulSoup as soup

# Setting URL variables
test_url = 'https://www.newegg.ca/Cell-Phones-Unlocked/SubCategory/ID-2961?Tid=165973'
urlClient = urlReq(test_url)
page_html = urlClient.read()
urlClient.close()

# Parsing HTML
page_soup = soup(page_html, "html.parser")

# Finding desired content
containers = page_soup.findAll("div", {"class": "item-container"})

# Creating files
filename = "products.csv"
f = open(filename, "w")
headers = "brand, title, total_price, old_price, percent_saved, ship_cost\n"
f.write(headers)

# --------------------------- FUNCTIONS ---------------------------

# Find elements with classes function
def findClass(contain, elem, class_name, index):
    return contain.findAll(elem, {"class": class_name})[index]

# Find total price of item
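# A sketch of the total-price helper announced by the comment above, built on
# findClass; the "price-current" markup (dollars in <strong>, cents in <sup>)
# is an assumption about Newegg's listing HTML at the time:
def findTotalPrice(contain):
    price = findClass(contain, "li", "price-current", 0)
    return price.strong.text + price.sup.text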
# GET JSON OBJECT FOR ONE PAGE
from urllib.request import urlopen as urlReq
from bs4 import BeautifulSoup as soup
import json
import dateutil.parser as parser

data = {}
contract_url = "https://www.sourcewell-mn.gov/cooperative-purchasing/022217-wex"
urlClient = urlReq(contract_url)
page_html = urlClient.read()
urlClient.close()
p_soup = soup(page_html, "html.parser")

container = p_soup.find("div", {"class": "vendor-contract-header__content"})
data['title'] = container.findAll('p')[0].text

# Parse the header paragraph lines (avoid shadowing the built-in str)
header_lines = container.findAll('p')[1].text.split('\n')
date = header_lines[1].split('Maturity Date:')[1].strip()
data['expiration'] = parser.parse(date).isoformat()
contract_number = header_lines[0].replace('#', '')
data['contract_number'] = contract_number
name = contract_number.split('-')[1]

files = {}
files["contract-forms"] = p_soup.findAll(
    "div", {"class": "field--item"})[2].findAll('span')[3].a["href"]
data["files"] = [files]

vendor = {}
contacts = {}
vendor["name"] = name
def Pull_Site():
    url = "http://www.safa.edu"
    siteResponse = urlReq(url)
    parsedPage = bSoup(siteResponse, "html.parser")
    return parsedPage
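# Usage sketch: Pull_Site returns a parsed BeautifulSoup tree, so a caller can
# query it directly (the tag choice below is illustrative):
# page = Pull_Site()
# for link in page.findAll("a"):
#     print(link.get("href"))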
from urllib.request import urlopen as urlReq
from bs4 import BeautifulSoup as soup

############################## For Flipkart ####################################################
my_url = "https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&as-pos=0&as-type=HISTORY&as-backfill=on"

############ Request the URL to get all the data
vClient = urlReq(my_url)
############ Read all the content of that page
page_html = vClient.read()
############ Close the request object
vClient.close()

########## Parse all the HTML data with Beautiful Soup
page_soup = soup(page_html, "html.parser")

########## Find all the content of any div by class name
containers = page_soup.find_all("div", {"class": "_1UoZlX"})
#print(len(containers))

########## Will pretty-print the parsed HTML
#print(soup.prettify(containers[0]))

# For one product
#container = containers[0]
#print(container)

########## Product Name
#prdname = container.find("div", {"class": "_3BTv9X"})
#print(soup.prettify(prdname))
#prod_name = prdname.img["alt"]
#print(prod_name)

########### Rating #################################
import bs4
from urllib.request import urlopen as urlReq
from bs4 import BeautifulSoup as soup
import csvWriter

searchString = "moto"
blockWords = ["None", 'screen', 'Protector', 'case', 'film']
searchPageDepth = 4
currentPage = 0

urlSite = "https://www.amazon.com/s?k=" + searchString + "&ref=nb_sb_noss_2"
webSite = urlReq(urlSite)
html = webSite.read()
webSite.close()
page_soup = soup(html, 'html.parser')
results = []

while currentPage < searchPageDepth:
    if currentPage != 0:
        if currentPage <= searchPageDepth:
            urlSite = 'https://amazon.com' + str(
                page_soup.find('li', {
                    'class': 'a-last'
                }).a['href']) + '/'
            webSite = urlReq(urlSite)
            html = webSite.read()
            webSite.close()
            page_soup = soup(html, 'html.parser')
    itemsWhole = page_soup.findAll(
        'span', {'cel_widget_id': 'SEARCH_RESULTS-SEARCH_RESULTS'})
    for item in itemsWhole:
        text = str(
            item.find('span',
num_images, folder_name = parse_arguments()

with open('images_urls.txt', 'w') as file:
    last_country = str()
    driver.get(url)
    button = driver.find_element_by_class_name('intro__explore')
    button.click()
    # Keep collecting until we have the requested number of image URLs
    while len(image_urls) != num_images:
        time.sleep(.2)
        url = driver.current_url
        uClient = urlReq(url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        images = page_soup.find_all('img', {'src': re.compile('.jpg')})
        country_name = page_soup.find_all('div', {'class': 'location__country'})
        try:
            for image in images:
                curr_url = image['src']
                if curr_url not in image_urls:  # new image
                    image_urls.add(curr_url)
                    curr_name = str()
                    try:
                        curr_name += country_name[0].previous
def searchInAmazon(searchString, blockedWord, searchPageDepth, sortPreference,
                   currency):
    datetime = date.datetime.now()
    searchString = searchString.replace(' ', '+')
    currentPage = 0
    urlSite = "https://www.amazon.com/s?k=" + searchString + "&ref=nb_sb_noss_2"
    webSite = urlReq(urlSite)
    html = webSite.read()
    webSite.close()
    page_soup = soup(html, 'html.parser')
    results = []
    while currentPage < searchPageDepth:
        if currentPage != 0:
            if currentPage <= searchPageDepth:
                # Follow the "next page" link of the previous results page
                urlSite = 'https://amazon.com' + str(
                    page_soup.find('li', {'class': 'a-last'}).a['href']) + '/'
                webSite = urlReq(urlSite)
                html = webSite.read()
                webSite.close()
                page_soup = soup(html, 'html.parser')
        itemsWhole = page_soup.findAll(
            'span', {'cel_widget_id': 'SEARCH_RESULTS-SEARCH_RESULTS'})
        for item in itemsWhole:
            def itemAnalysis():
                if 'App' not in str(item) and 'Prime Video' not in str(item):
                    text = str(
                        item.find('span', {
                            'class': 'a-size-medium a-color-base a-text-normal'
                        }))
                    name = text.strip(
                        '<span class="a-size-medium a-color-base a-text-normal" dir="auto">'
                    ).strip('</')
                    if (item.find('free') or item.find('FREE')):
                        price = fullPrice = '0'
                    else:
                        try:
                            price = str(
                                item.find('span', {
                                    'class': 'a-price-whole'
                                }).text).replace(',', '').strip('.')
                            fullPriceSpan = item.find('span',
                                                      {'data-a-strike': 'true'})
                            try:
                                fullPrice = str(
                                    fullPriceSpan.find('span', {
                                        'class': 'a-offscreen'
                                    }).text).strip('$').partition('.')[0]
                            except AttributeError:
                                fullPrice = price
                            try:
                                if fullPrice != price:
                                    discount = str(
                                        100 - round(float(price), 2) * 100 /
                                        round(float(fullPrice), 2)).partition('.')[0]
                                    #print(discount)
                                else:
                                    discount = '0'
                            except ValueError:
                                discount = '0'
                            itemNumber = str(len(results))
                            link = ('amazon.com' + item.find(
                                'a', {'class': 'a-link-normal a-text-normal'
                                      })['href']).partition('ref')[0]
                            img = item.find('img', {'class': 's-image'})['src']
                            print(img)
                            results.append(
                                (itemNumber, price, name, link, discount,
                                 str(datetime), amazonDBPK, img))
                        except AttributeError as err:
                            pass  #print('Item Skipped in Amazon due to: ' + str(err))
            bWordFound = 0
            for bWord in blockedWord:
                if bWord in str(item):
                    bWordFound += 1
            if bWordFound == 0:
                itemAnalysis()
        currentPage = currentPage + 1
    print('results in Amazon :' + str(len(results)))
    if sortPreference == 'Increasing':
        return sortResults.sortIncreasing(results)
    if sortPreference == 'Decreasing':
        return sortResults.sortDecreasing(results)
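# Usage sketch: the arguments below are illustrative; amazonDBPK, date, soup,
# urlReq, and sortResults must already be defined at module level.
# items = searchInAmazon('usb hub', ['case', 'cable'], 2, 'Increasing', 'USD')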
from urllib.request import urlopen as urlReq  # grab page
from bs4 import BeautifulSoup as soup  # parse text
import json
import dateutil.parser as parser
from urllib.request import Request
from socket import timeout
import logging

pageNames = []
numPages = 9
header = {'User-Agent': 'CoProcure Technical Challenge'}

for x in range(0, numPages):
    basePageUrl = ('https://www.sourcewell-mn.gov/contract-search?category=All&keyword=&page='
                   + str(x))
    urlClient = urlReq(basePageUrl)
    page_html = urlClient.read()
    urlClient.close()
    p_soup = soup(page_html, "html.parser")
    container = p_soup.findAll(
        "p", {"class": "component__search-vendors-contracts-number"})
    for n in container:
        pageNames.append(n.text.replace('#', '').strip())

contract_data = []
for i in pageNames:
    data = {}
    contract_url = "https://www.sourcewell-mn.gov/cooperative-purchasing/" + i
    req = Request(contract_url, headers=header)
def get_foodnetwork(recipeUrl):
    # Open the connection, grab the web page, and store the raw HTML
    uClient = urlReq(recipeUrl)
    htmlRaw = uClient.read()
    uClient.close()
    # Parse raw HTML
    soup = BeautifulSoup(htmlRaw, "html.parser")
    recipeCard = {}
    # Store site name into recipe card
    recipeCard['siteName'] = "Food Network"
    # Store URL into recipe card
    recipeCard['URL'] = recipeUrl
    # Find title and store into recipe card
    recipeTitle = soup.find("span", {
        "class": "o-AssetTitle__a-HeadlineText"
    }).text
    recipeCard['title'] = recipeTitle
    # Find image URL and store into recipe card; fall back to a placeholder
    recipeHeader = soup.find("div", {"class": "recipe-lead"})
    imageContainer = recipeHeader.find(
        "img", {"class": "m-MediaBlock__a-Image a-Image"})
    if imageContainer:
        imageUrl = imageContainer['src']
        recipeCard['image'] = "http:" + imageUrl
    else:
        recipeCard['image'] = "https://i.imgur.com/bvzLAyR.jpg"
    # Find metadata of the recipe
    metadataAry = []
    levelInfo = soup.find("div", {"class": "o-RecipeInfo"})
    itemsInfo = levelInfo.findAll("li")
    for item in itemsInfo[:5]:
        item = item.text.replace('\n', ' ').strip()
        metadataAry.append(item)
    recipeCard['metadata'] = metadataAry
    # Find ingredients
    ingredientListAry = []
    ingredientList = soup.findAll("p", {"class": "o-Ingredients__a-Ingredient"})
    for ingredientItem in ingredientList:
        ingredient = ingredientItem.text
        ingredientListAry.append(ingredient)
    recipeCard['ingredients'] = ingredientListAry
    # Find instructions
    instructionsAry = []
    instructionsContainer = soup.findAll("li", {"class": "o-Method__m-Step"})
    for instructionItem in instructionsContainer:
        instruction = instructionItem.text.strip()
        instructionsAry.append(instruction)
    recipeCard['instructions'] = instructionsAry
    # Return single recipe
    return recipeCard
def searchInMercadoLibre(searchString, blockedWord, searchPageDepth,
                         sortPreference, currency):
    searchString = searchString.replace(' ', '+')
    currentPage = 0
    datetime = date.datetime.now()
    if currency == 'USD':
        # Fetch the current peso/dollar buy rate to convert prices
        urlSite = 'http://www.dolarhoy.com/'
        webSite = urlReq(urlSite)
        html = webSite.read()
        webSite.close()
        page_soup = soup(html, 'html.parser')
        usdContainer = page_soup.find('div', {'class': 'pill pill-coti'})
        usdCompra = str(usdContainer.findAll('span')[1:2])[23:30].strip('</').replace(',', '.')
        print('USD Compra= ' + str(usdCompra))
    urlSite = "https://listado.mercadolibre.com.ar/" + searchString + '/'
    webSite = urlReq(urlSite)
    html = webSite.read()
    webSite.close()
    page_soup = soup(html, 'html.parser')
    results = []
    while currentPage < searchPageDepth:
        if currentPage != 0:
            if currentPage <= searchPageDepth:
                urlSite = str(
                    page_soup.find('li', {
                        'class': 'andes-pagination__button andes-pagination__button--next'
                    }).a['href'])
                webSite = urlReq(urlSite)
                html = webSite.read()
                webSite.close()
                page_soup = soup(html, 'html.parser')
        itemsWhole = page_soup.findAll(
            'li', {'class': 'results-item highlighted article stack'})
        for item in itemsWhole:
            def itemAnalysis():
                text = str(item.find('span', {'class': 'main-title'}))
                name = text.strip('<span class="main-title"> ').strip('</span>')
                try:
                    price = str(item.find('span', {
                        'class': 'price__fraction'
                    }).text).replace('.', '')
                except AttributeError:
                    try:
                        price = str(
                            item.find('div', {
                                'class': 'pdp_options__text pdp_options--no-margin'
                            }).text.strip(' $ ').partition(' ')[0]).replace('.', '')
                    except AttributeError:
                        price = str(
                            str(item.find('div', {
                                'class': 'pdp_options__text pdp_options--no-margin'
                            })).strip(' $ ').partition(' ')[0]).replace('.', '')
                if price == 'None':
                    print('found prices set to None')
                    price = str(item.find('div', {
                        'class': 'pdp_options__text pdp_options--no-margin'
                    })).partition('<span>$')[2].partition('</span>')[0].replace('.', '')
                    print('price: ' + price)
                if currency == 'USD':
                    price = float(price) / float(usdCompra)
                    price = str(round(price, 2))
                try:
                    discount = str(item.find('div', {
                        'class': 'item__discount'
                    }).text).strip('% OFF')
                except AttributeError:
                    discount = '0'
                if discount == 'None':
                    discount = '0'
                itemNumber = str(len(results))
                link = str(item.a['href'])
                if 'JM' in link:
                    link = link.partition('JM')[0] + 'JM'
                else:
                    link = link.partition('?')[0]
                img = str(item.find('img', {})).partition('src="')[2].partition('"')[0]
                print(img)
                results.append((itemNumber, price, name, link.strip('https://'),
                                discount, str(datetime), mercadolibreDBPK, img))
                #print("item #" + itemNumber + ": " + name + " $" + price + ' OFF: ' + discount)
            bWordFound = 0
            for bWord in blockedWord:
                if bWord in str(item):
                    bWordFound += 1
            if bWordFound == 0:
                itemAnalysis()
        currentPage = currentPage + 1
    print('results in MercadoLibre :' + str(len(results)))
    if sortPreference == 'Increasing':
        return sortResults.sortIncreasing(results)
    if sortPreference == 'Decreasing':
        return sortResults.sortDecreasing(results)
import bs4
from urllib.request import urlopen as urlReq
from bs4 import BeautifulSoup as bSoup

flip_url = 'https://www.flipkart.com/search?q=laptop&sid=6bo%2Cb5g&as=on&as-show=on&otracker=AS_QueryStore_OrganicAutoSuggest_3_7_na_na_na&otracker1=AS_QueryStore_OrganicAutoSuggest_3_7_na_na_na&as-pos=3&as-type=RECENT&suggestionId=laptop%7CLaptops&requestId=3ce158b5-e07c-46b0-b18f-a727831d42c4&as-searchtext=laptop'

uClient = urlReq(flip_url)
html_page = uClient.read()
uClient.close()

soup_page = bSoup(html_page, "html.parser")
containers = soup_page.findAll("div", {"class": "_1-2Iqu row"})

filename = "laptops.csv"
f = open(filename, "w")
headers = "Model, Processor, RAM, ROM, Display, Price\n"
f.write(headers)

for container in containers:
    model_name = container.findAll("div", {"class": "_3wU53n"})
    model = model_name[0].text
    price_container = container.findAll("div", {"class": "_1vC4OE _2rQ-NK"})
    price = price_container[0].text
    content = container.findAll("li", {"class": "tVe95H"})
    processor = content[0].text
    ram = content[1].text