from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as soup


def mainPageScrape(f):
    address = "https://www.newegg.com/Processors-Desktops/SubCategory/ID-343"

    # opening up connection, grabbing the page
    uClient = UReq(address)
    page_html = uClient.read()
    uClient.close()

    # html parsing
    page_soup = soup(page_html, "html.parser")

    # add each processor item container to a list of containers
    containers = page_soup.findAll("div", {"class": "item-container"})

    for container in containers:
        # containerScrape (defined elsewhere) returns the seven CSV fields
        fields = containerScrape(container)
        csv_string = ",".join(fields[:7])
        if fields[1] in descriptionlog:
            print("Duplicate processor found. Not writing to list.")
        else:
            descriptionlog.append(fields[1])
            print(csv_string)
            f.write(csv_string + "\n")
    containers.clear()
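# containerScrape and descriptionlog are defined elsewhere in the original
# script. Below is a minimal, purely hypothetical sketch of the shapes the
# scrape functions here expect: a module-level list of already-seen
# descriptions, and a helper returning seven string fields per item
# container. The selector is a placeholder, not Newegg's confirmed markup.
descriptionlog = []


def containerScrape(container):
    title = container.find("a", {"class": "item-title"})
    name = title.text.strip() if title else ""
    # stand-in for the seven real fields (brand, description, price, ...)
    return [name] * 7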
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as beau

myurl = 'https://campinascomprelocal.com.br/tipo/bares/'
print(myurl)

# open the connection and grab the page
uClient = UReq(myurl)
page_html = uClient.read()
uClient.close()

soup = beau(page_html, 'lxml')
contents = soup.title
print(contents)
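# The bare urlopen above raises on a bad URL or an HTTP error status. As an
# optional, defensive sketch of the same fetch using urllib's own error
# types (the URL and parser choice are unchanged):
from urllib.error import HTTPError, URLError

try:
    uClient = UReq(myurl)
    page_html = uClient.read()
    uClient.close()
except HTTPError as err:
    print("Server returned an error: " + str(err.code))
except URLError as err:
    print("Could not reach the page: " + str(err.reason))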
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as soup

# question 1: scrape all the article titles shown on this page, then print
# each title and the number of titles found
my_url = 'https://www.dr.dk/nyheder/tema/coronavirus'

uClient = UReq(my_url)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, 'html.parser')
# print(page_soup.title)

# grab each article title span
containers = page_soup.findAll("span", {"class": "dre-teaser-title__text"})
# print(containers)
print(len(containers))

for container in containers:
    # the matched span holds the title text directly
    article_name = container.text
    print(article_name)
def getPage(self):
    # open the connection, read the page html into the instance, then close
    uClient = UReq(self.url)
    self.page_html = uClient.read()
    uClient.close()
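# getPage above is an instance method, so it presumably lives on a scraper
# class that stores a url and the fetched html. A minimal, hypothetical
# class it could sit in (the class name and the parse helper are
# illustrative, not from the original code):
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as soup


class PageScraper:
    def __init__(self, url):
        self.url = url
        self.page_html = None

    def getPage(self):
        uClient = UReq(self.url)
        self.page_html = uClient.read()
        uClient.close()

    def parse(self):
        # turn the stored html into a searchable soup
        return soup(self.page_html, "html.parser")

# usage sketch:
# scraper = PageScraper("https://example.com")
# scraper.getPage()
# page_soup = scraper.parse()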
def remainingPagesScrape(f):
    page = 2
    duplicateCount = 0
    link = 'https://www.newegg.com/Processors-Desktops/SubCategory/ID-343/Page-'

    while True:
        try:
            address = link + str(page)
            print()
            print("Preparing to Scrape Page: " + str(page))
            print("Address: " + address)
            print()

            # opening up connection, grabbing the page
            uClient = UReq(address)
            page_html = uClient.read()
            uClient.close()

            # html parsing
            page_soup = soup(page_html, "html.parser")

            # add each processor item container to a list of containers
            containers = page_soup.findAll("div", {"class": "item-container"})

            for container in containers:
                fields = containerScrape(container)
                csv_string = ",".join(fields[:7])
                if fields[1] in descriptionlog:
                    print("Duplicate processor found. Not writing to list.")
                    duplicateCount += 1
                else:
                    descriptionlog.append(fields[1])
                    print(csv_string)
                    f.write(csv_string + "\n")
            containers.clear()

            # a long run of duplicates means the site is re-serving the same
            # listings, so stop scraping
            if duplicateCount > 100:
                print()
                print("Duplicate Count Is " + str(duplicateCount) +
                      ". This Suggests The Data Is Being Reiterated. The Script Will Stop.")
                print("Processor Scrape Complete")
                print()
                print("Traversed " + str(page) + " Pages")
                print(str(len(descriptionlog)) + " Unique Processors Found")
                print()
                print("Data Written To: " + f.name)
                f.close()
                break

            page += 1
        except IndexError:
            # containerScrape hit a container it could not parse; report
            # progress and move on to the next page
            print()
            page += 1
            # f.close()
            print("So Far We Have Traversed " + str(page - 1) + " Pages")
            print(str(len(descriptionlog)) + " Unique Processors Found")
            print(str(duplicateCount) + " Duplicates Ignored")
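# A possible driver for the two Newegg scrape functions above, assuming they
# share a module with containerScrape and descriptionlog. The output filename
# is an illustrative placeholder, not taken from the original script.
if __name__ == "__main__":
    with open("processors.csv", "w") as out_file:
        mainPageScrape(out_file)
        # remainingPagesScrape closes the file itself once it detects
        # reiterated data; the with-block close is then a harmless no-op
        remainingPagesScrape(out_file)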