def crear_calendario(temp, path):
    global partidos_df
    page_soup = BS(uOpen(path).read(), 'html.parser')
    jornadas = page_soup.find_all('div', {'class': 'jornada-calendario-historico'})
    for jornada in jornadas:
        numero_jornada_y_fecha = jornada.div.text
        jor, fecha = filtrar_jornada(numero_jornada_y_fecha)
        partidos = jornada.findAll('td')
        for j, partido in enumerate(partidos):
            resultado = partido.text
            eq_loc, eq_vis, gol_loc, gol_vis = filtrar_resultado(resultado)
            eq_loc, eq_vis = limpiar_nombre(eq_loc), limpiar_nombre(eq_vis)
            eq_loc, eq_vis = buscar_equivalencia(eq_loc), buscar_equivalencia(eq_vis)
            res = pd.DataFrame([[temp, jor, j + 1, fecha, eq_loc, eq_vis, gol_loc, gol_vis]],
                               columns=list(partidos_df))
            partidos_df = partidos_df.append(res)
    partidos_df = partidos_df.reset_index()
    partidos_df.drop('index', axis=1, inplace=True)
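# The helpers filtrar_jornada and filtrar_resultado are defined elsewhere in the
# project. As a hypothetical illustration only, filtrar_resultado is assumed to
# split a result string such as 'Equipo A 2-1 Equipo B' into the two team names
# and the two goal counts; the real input format may differ.
import re

def filtrar_resultado(resultado):
    m = re.match(r'(.+?)\s+(\d+)\s*-\s*(\d+)\s+(.+)', resultado.strip())
    if m is None:
        return resultado.strip(), '', None, None  # unparsed result: return it untouched
    eq_loc, eq_vis = m.group(1), m.group(4)
    gol_loc, gol_vis = int(m.group(2)), int(m.group(3))
    return eq_loc, eq_vis, gol_loc, gol_vis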
def crear_calendario(temp, path):
    global partidos_df
    page_soup = BS(uOpen(path).read(), 'html.parser')
    rounds = page_soup.find_all('div', {'class': 'jornada calendarioInternacional'})
    for r in rounds:
        rnd = r.caption.text  # Get the name of the round, e.g. Jornada 1
        matches = r.findAll('tr')  # Find all the matches in that round
        for j, match in enumerate(matches[1:]):
            loc = match.find('td', {'class': 'local'}).span.text
            away = match.find('td', {'class': 'visitante'}).span.text
            loc, away = limpiar_nombre(loc), limpiar_nombre(away)
            loc, away = buscar_equivalencia(loc), buscar_equivalencia(away)
            res = pd.DataFrame([[rnd, j + 1, loc, away]], columns=list(partidos_df))
            partidos_df = partidos_df.append(res)
    partidos_df = partidos_df.reset_index()
    partidos_df.drop('index', axis=1, inplace=True)
def read_page(page_url):
    # Request page
    client = uOpen(page_url)
    page_html = client.read()
    client.close()
    return page_html
def get_poke_soup(link):
    uClient = uReq(link, headers={'User-Agent': 'Magic Browser'})
    uCon = uOpen(uClient)
    poke_page_html = uCon.read()
    uCon.close()
    return soup(poke_page_html, 'html.parser')
def url_to_image(url):
    '''Fetch an image from a URL and decode it into an OpenCV (BGR) array.'''
    resp = uOpen(url)
    image = np.asarray(bytearray(resp.read()), dtype='uint8')
    image = cv2.imdecode(image, cv2.IMREAD_COLOR)
    return image
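# Minimal usage sketch for url_to_image; the URL below is a placeholder
# assumption rather than one taken from the project, and np, cv2 and uOpen are
# assumed to be imported as in the function above.
badge = url_to_image('https://example.com/badge.png')
print(badge.shape)  # (height, width, 3) once cv2 has decoded the colour image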
def news_titl(self):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    news_url = 'https://economictimes.indiatimes.com/'
    uClient = uOpen(Request(news_url, headers=header))  # requesting the webpage from the internet
    newsPage = uClient.read()  # reading the webpage
    uClient.close()  # closing the connection to the web server
    news_html = bs(newsPage, "html.parser")  # parsing the webpage as HTML
    bigboxes = news_html.findAll('ul', {"class": "newsList clearfix"})  # searching for the tag that holds the news titles
    box = bigboxes[0].contents  # taking the children of the first matching list (for demo)
    del box[10:]  # keeping only the first 10 news items
    news = []  # initializing an empty list for the news titles
    for b in box:
        topicLink = "https://economictimes.indiatimes.com/" + b.a['href']  # extracting the actual article link
        topicRes = uOpen(Request(topicLink, headers=header))  # getting the article page from the server
        topic_html = bs(topicRes, "html.parser")  # parsing the article page as HTML
        title_content = topic_html.findAll('div', {"class": "topPart clearfix tac fixedOnLoad"})  # tag holding the article title
        body_content = topic_html.findAll('div', {"class": "artSyn bgPink"})  # tag holding the article summary
        title = title_content[0].h1.text
        content = body_content[0].h2.text
        my_dict = {"Title": title, "Article": content}
        # fns = main_functions()
        # fns.store_raw_news(collection=collection, db_name=db_name, json=my_dict)
        news.append(my_dict)
    return news
def get_item_data(self, item_data):
    '''Get item data'''
    url = 'https://eu.api.battle.net/d3/data/item/{}?locale={}&apikey={}'.format(
        item_data, self.LOCALE, self.API_KEY)
    uClient = uOpen(url)
    output = uClient.read()
    uClient.close()
    parsed_output = ujson.loads(output)
    return parsed_output
def get_hero_profile(self, battleTag, heroID):
    '''Get hero profile'''
    battleTag = str(battleTag).replace('#', '%23')
    url = 'https://eu.api.battle.net/d3/profile/{}/hero/{}?{}&apikey={}'.format(
        battleTag, heroID, self.LOCALE, self.API_KEY)
    uClient = uOpen(url)
    output = uClient.read()
    uClient.close()
    parsed_output = ujson.loads(output)
    return parsed_output
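# Hedged usage sketch for the two Battle.net wrappers above. The client class
# name (D3Api), the locale string and the API key are illustrative assumptions;
# the enclosing class is not shown in this snippet.
#
#   api = D3Api()                                   # assumed to define LOCALE and API_KEY
#   item = api.get_item_data('item/<item-slug>')    # returns the parsed JSON as a dict
#   hero = api.get_hero_profile('Player#1234', 123456)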
def index():
    if request.method == 'GET':
        try:
            searchString = "news_titles"
            dbConn = pymongo.MongoClient("mongodb://localhost:27017/")  # opening a connection to Mongo
            db = dbConn['newscrawlerDB']  # connecting to the database called newscrawlerDB
            news = db[searchString].find({})  # searching the collection with the same name as the keyword
            if news.count() > 0:  # if the collection already exists and has records in it
                return render_template('results.html', news=news)  # show the cached results to the user
            else:
                header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
                news_url = 'https://economictimes.indiatimes.com/'
                uClient = uOpen(Request(news_url, headers=header))  # requesting the webpage from the internet
                newsPage = uClient.read()  # reading the webpage
                uClient.close()  # closing the connection to the web server
                news_html = bs(newsPage, "html.parser")  # parsing the webpage as HTML
                bigboxes = news_html.findAll('ul', {"class": "newsList clearfix"})  # searching for the tag that holds the news titles
                box = bigboxes[0].contents  # taking the children of the first matching list (for demo)
                del box[10:]  # keeping only the first 10 news items
                news = []  # initializing an empty list for the news titles
                table = db[searchString]  # creating a collection with the same name as the search string (collections are analogous to tables)
                for b in box:
                    topicLink = "https://economictimes.indiatimes.com/" + b.a['href']  # extracting the actual article link
                    topicRes = uOpen(Request(topicLink, headers=header))  # getting the article page from the server
                    topic_html = bs(topicRes, "html.parser")  # parsing the article page as HTML
                    title_content = topic_html.findAll('div', {"class": "topPart clearfix tac"})  # tag holding the article title
                    body_content = topic_html.findAll('div', {"class": "artSyn bgPink"})  # tag holding the article summary
                    title = title_content[0].h1.text
                    content = body_content[0].h2.text
                    my_dict = {"Title": title, "Article": content}
                    x = table.insert_one(my_dict)  # inserting the dictionary containing the news item into the collection
                    news.append(my_dict)
                return render_template('results.html', news=news)
        except:
            return 'something is wrong'
def getPokeImage(pokeURL):
    # getting the basename of the pokemon
    # Charizard is the pokeBaseName for Charizard and its two Mega forms, Mega Charizard X and Mega Charizard Y
    pokeBaseName = pokeURL.split('/')[-1]
    # open the URL for that pokemon and read in the html
    pokemon = uOpen(pokeURL)
    pokePage = pokemon.read()
    pokeSoup = soup(pokePage, 'html.parser')
    # pictures are linked to in divs of class profile-images
    pokeProfile = pokeSoup.findAll('div', {'class': 'profile-images'})[0]
    pokeImages = pokeProfile.findAll('img')
    # the 'alt' attribute stores the names of the pokemon
    pokeNames = [pokeImage['alt'] for pokeImage in pokeImages]
    # the 'src' attribute stores the links to the pictures of the pokemon
    pokePicLinks = [pokeImage['src'] for pokeImage in pokeImages]
    pokePicTuple = list(zip(pokePicLinks, pokeNames))
    # given the name and the link of a pokemon, uRetrieve can download its picture
    for pokePicLink, pokeName in pokePicTuple:
        # sometimes the 'alt' attribute does not include the basename of the pokemon,
        # so it needs to be added to the name of the png file
        if pokeBaseName not in pokeName:
            pokePic = uRetrieve(pokePicLink, pokeBaseName + ' ' + pokeName + '.png')
        else:
            pokePic = uRetrieve(pokePicLink, pokeName + '.png')
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uOpen

filename = "pc_games2.csv"
file = open(filename, "w")
file.close()
file = open(filename, "a+")
headers = "name,company,price,save\n"
file.write(headers)

for i in range(1, 11):
    myUrl = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=100007756&IsNodeId=1&Description=pc%20games&page={}&bop=And&PageSize=36&order=BESTMATCH'.format(i)
    Client = uOpen(myUrl)
    page = Client.read()
    Client.close()
    html_page = soup(page, "html.parser")
    containers = html_page.findAll("div", {"class": "item-container"})
    for container in containers:
        name = container.a.img["title"]
        b = (container.find("div", "item-action").ul.li.text).strip()
        old_price = b[0:6]
        c = (container.find("div", "item-action").ul).find("li", "price-current").text.strip()
        new_price = c[0:6]
        # save = (container.find("div", "item-action").ul).find("li", "price-save").find("span", "price-save-percent").text.strip()
        print("\n")
        print("name " + name)
        try:
            company = container.find("div", "item-info").img["title"]
        except:
            company = "N/A"  # assumption: fall back when no brand image is present
from urllib.request import urlopen as uOpen, Request as uReq
from bs4 import BeautifulSoup as soup, NavigableString

uClient = uReq('https://bulbapedia.bulbagarden.net/w/index.php?title=Category:Pok%C3%A9mon_that_are_part_of_a_three-stage_evolutionary_line&pagefrom=Raichu+%28Pok%C3%A9mon%29#mw-pages',
               headers={'User-Agent': 'Magic Browser'})
uCon = uOpen(uClient, None, 5)
poke_page_html = uCon.read()
uCon.close()

ps = soup(poke_page_html, 'html.parser')
ps = ps.find(attrs={'id': 'mw-content-text'}).find('div', attrs={'class': 'mw-category'})

with open('d.txt', 'a') as f:
    ps = ps.find_all('a')
    ps = [p.text.replace(' (Pokémon)', '\n') for p in ps]
    for p in ps:
        f.write(p)
def connect(url):
    '''Connect to url and return html source as string'''
    uClient = uOpen(url)
    html_source = uClient.read()
    uClient.close()
    return html_source
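# Minimal usage sketch for connect(): feed the returned HTML straight into
# BeautifulSoup. The URL is a placeholder assumption, and BeautifulSoup is
# assumed to be imported as soup, as in the other snippets in this collection.
page_soup = soup(connect('https://example.com/'), 'html.parser')
print(page_soup.title)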
import re
import os
from urllib.request import urlopen as uOpen
from bs4 import BeautifulSoup as soup

# associates the windows clear terminal command with a simpler name
clear = lambda: os.system('cls')

# this script scrapes the list of all steam products (including bundles, games,
# videos, music, and software)
# at 'https://store.steampowered.com/search/?sort_by=Name_ASC',
# and calculates the current mean user rating of all the products
# on the steam platform.
my_url = "https://store.steampowered.com/search/?sort_by=Name_ASC"

# opening connection and downloading the page
uClient = uOpen(my_url)
page_html = uClient.read()
uClient.close()

# instantiating html parser
page_soup = soup(page_html, "html.parser")

# gets all steam games on the page
games = page_soup.findAll("a", {"class": "search_result_row"})

# creates an empty array which is later populated with values for product review scores
reviewScores = []

# creates an integer value for the last catalogue page number
lastPageNum = 0
pageNumTags = page_soup.findAll(
# print("vignvig") StopUpdate = "y" return [tempNINI, StopUpdate] else: tempNINI.update({str(EpiUID): [SerialUID, EpiImage]}) except: pass return [tempNINI, StopUpdate] # ************************************************ start_time = time.time() MasterSerialList = uOpen( 'https://raw.githubusercontent.com/pravanjam/TamilSerialz/master/Master_Serial_List.csv' ) dataMst = StringIO(MasterSerialList.read().decode('ascii', 'ignore')) dreader = csv.reader(dataMst) Serial_Mst = [] SerialMeta = [] for row in dreader: Serial_Mst.append(row) for SerialListID in range(1, len(Serial_Mst)): # print(Serial_Mst[SerialListID][1]) SerialMeta.append({ 'SearchID': Serial_Mst[SerialListID][1], 'bkURL': Serial_Mst[SerialListID][2], 'Genre': Serial_Mst[SerialListID][3],
def urlParser(my_url, parserType):
    urlHTML = uOpen(my_url)
    page_html = urlHTML.read()
    urlHTML.close()
    PParser = bSoup(page_html, parserType)
    return PParser
def request_web(url):
    request = uOpen(url)
    html_file = request.read()
    return html_file
    ju = jugador.replace(minuto, '')
    ju = limpiar_nombre(ju, [], stopwords=matches_stopwords)
    ju = ju.replace('()', '').rstrip()
    return mi, ju


temps = list(url_temporadas.keys())
for temp in temps:
    # temp = temps[0]
    url_jornadas = url_temporadas[temp]
    for jor, url in enumerate(url_jornadas):
        # url = url_jornadas[0]  # debug override of the loop variable (disabled)
        page = BS(uOpen(url).read(), 'html.parser')
        page.find('main')
        partidos = page.find('div', {'class': 'resultados borde-caja'})
        partidos = partidos.find('table')
        partidos = partidos.findAll('tr')
        for n, partido in enumerate(partidos):
            eq_loc = partido.find('td', {'class': 'equipo-local'}).text
            eq_vis = partido.find('td', {'class': 'equipo-visitante'}).text
            eq_loc, eq_vis = limpiar_nombre(eq_loc), limpiar_nombre(eq_vis)
            eq_loc, eq_vis = buscar_equivalencia(eq_loc), buscar_equivalencia(eq_vis)
            resultado = partido.find('td', {'class': 'resultado'}).text
from urllib.request import urlopen as uOpen
from bs4 import BeautifulSoup as soup

target = 'https://www.newegg.ca/Product/ProductList.aspx?Submit=ENE&IsNodeId=1&N=100007708%20600536049%20600536050%20600565061%20600565504%20600565674%20601107975%20601203793%20601204369%20601210955%20601205646%20601202919%20601203927%20601203901%20601294835%20601295933%20601194948%20601296707&cm_sp=Cat_video-Cards_1-_-Visnav-_-Gaming-Video-Cards_2'

# open connection, grab page, and then close connection
Client = uOpen(target)
html = Client.read()
Client.close()

# html parsing
soupy = soup(html, "html.parser")

# grabbing each product
containers = soupy.findAll("div", {"class": "item-container"})

print("Welcome to the NewEgg GPU WebScraper!\n")

for container in containers:
    manufacturer = container.div.div.a.img["title"]
    title_container = container.findAll("a", {"class": "item-title"})
    title = title_container[0].text
    shipping_container = container.findAll("li", {"class": "price-ship"})
    shipping = shipping_container[0].text.strip()
    print("Manufacturer: " + manufacturer)
    print("Title: " + title)
    print("Shipping: " + shipping + "\n")
    f.close()
except:
    print("Type in the PC Part Picker URL you wish to price track:")
    url = input()
    f = open(filename, "w")
    headers = "URL, Date_Time, PC_Price, ChangeInPrice\n"
    f.write(headers)

now = datetime.datetime.now()
my_url = url
hdr = {'User-Agent': 'Mozilla/5.0'}

# opening Client
try:
    req = uReq(my_url, headers=hdr)
    uClient = uOpen(req)
    page_html = uClient.read()
    uClient.close()
except:
    f.close()
    print("Could not open URL... Try again with a different URL")
    os.remove(filename)
    quit()

# parses the html page
page_soup = soup(page_html, "html.parser")

# Gets the current Price of the PC Build
Prices = page_soup.findAll("tr", {"class": "total-price part-list-totals"})
price = Prices[0]
buildprice = '"' + price.find("td", {"class": "tr nowrap"}).text + '"'
print("Current PC Build Costs: " + buildprice)
"Provide the full path where CSV reports shall be stored ... : ") CSVfile = "Flipkart_INTERACTIVE_%s.CSV" % DateStamp #Assuming you run from scripts directory #OutCSV = open(CSVpath + "/" + CSVfile, 'w', newline='') OutCSV = open(CSVfile, 'w', encoding="utf-8", newline='') OutWriter = csv.writer(OutCSV) #print("SlNo.|itemName|rating|price|oldPrice|discount") print("\n Ouput will be displayed in a moment ... \n") #OutWriter.writerow("SlNo.|itemName|rating|price|oldPrice|discount") OutWriter.writerow("IRPOD") for pg in range(0, (int(pages))): URL = (baseURL + "&page=" + str(pg)) #print("\n\n\n ############# \n Now URL is : " + URL) #URL = 'https://www.flipkart.com/audio-video/pr?sid=0pm&marketplace=FLIPKART&offer=nb:mp:1154f86928,nb:mp:11cc851a28&hpid=u0KJH80uWRAYeEJJpMIZYap7_Hsxr70nj65vMAAFKlc=&fm=neo%2Fmerchandising&iid=M_62ce2069-ba72-4633-a9f3-272c137582ba_2.VLO9AZPF3DJW&ppt=clp&ppn=dotd-store&ssid=m03cg1ws6o0000001609272953413&otracker=clp_omu_infinite_Deals%2Bof%2Bthe%2BDay_2_2.dealCard.OMU_INFINITE_dotd-store_dotd-store_VLO9AZPF3DJW&cid=VLO9AZPF3DJW' #URL = 'https://www.flipkart.com/audio-video/pr?sid=0pm&marketplace=FLIPKART&offer=nb%3Amp%3A1154f86928%2Cnb%3Amp%3A11cc851a28&hpid=u0KJH80uWRAYeEJJpMIZYap7_Hsxr70nj65vMAAFKlc%3D&fm=neo%2Fmerchandising&iid=M_62ce2069-ba72-4633-a9f3-272c137582ba_2.VLO9AZPF3DJW&ppt=clp&ppn=dotd-store&ssid=m03cg1ws6o0000001609272953413&otracker=clp_omu_infinite_Deals%2Bof%2Bthe%2BDay_2_2.dealCard.OMU_INFINITE_dotd-store_dotd-store_VLO9AZPF3DJW&cid=VLO9AZPF3DJW&page=2' uReq = uOpen(URL) HtmlPage = uReq.read() uReq.close() PageSoup = soup(HtmlPage, "html.parser") containers = PageSoup.find_all("div", {"class": "_4ddWXP"}) ratingsAll = PageSoup.find_all("div", {"class": "_3LWZlK"}) reviewsAll = PageSoup.find_all("span", {"class": "_2_R_DZ"}) pricesAll = PageSoup.find_all("div", {"class": "_30jeq3"}) oldPricesAll = PageSoup.find_all("div", {"class": "_3I9_wc"}) discountsAll = PageSoup.find_all("div", {"class": "_3Ay6Sb"}) imageLinksAll = PageSoup.find_all("div", {"class": "_4ddWXP"}) #containers = PageSoup.find_all("div") #print(containers) #print("###########\n") # print("Container Length :" + str(len(containers)))
filename = "playersFutbin.csv" f = open(filename, "w") headers = "player, rating, price\n" f.write(headers) i = 1 while i < 3: my_url = 'https://www.futbin.com/19/players?page=' + str(i) pgdownload = Request(my_url, headers={'User-Agent': 'Mozilla/5.0'}) page_html = uOpen(pgdownload).read() uOpen(pgdownload).close() page_soup = soup(page_html, "html.parser") containers = page_soup.findAll("tr", {"class": "player_tr_1"}) containers2 = page_soup.findAll("tr", {"class": "player_tr_2"}) containers.extend(containers2) for container in containers: player_name = container.find("a", {"class": "player_name_players_table"}).text if container.find("span", {"class": "form rating ut19 icon gold rare"}) is not None: rating = container.find("span", {"class": "form rating ut19 icon gold rare"}).text elif container.find("span", {"class": "form rating ut19 gold rare"}) is not None:
__author__ = "Laurence Elliott" from urllib.request import urlopen as uOpen from bs4 import BeautifulSoup as soup import re import os myUrl = "https://www.freewarefiles.com/search.php?categoryid=1&query=&boolean=exact" # connecting to and downloading page uClient = uOpen(myUrl) page_html = uClient.read() uClient.close() # instatiating BeautifulSoup parsing of first page page_soup = soup(page_html, "html.parser") # gets page numbers from list above program listings numPagesA = page_soup.findAll("li", {"class": "page-item"}) numPagesArr = [] for numPageA in numPagesA: numPage = numPageA.findAll("a", {"class": "page-link"})[0] try: numPage = re.search('(?<=>)[0-9]+(?=<\/a>)', str(numPage)).group(0) numPagesArr.append(numPage) except: pass # the last of the list of page numbers is stored for reference as the last # page of the search maxPage = numPagesArr[-1]
def request_web(self):
    request = uOpen(self.url)
    html_file = request.read()
    return html_file
# --------------
refs_url = 'http://www.livefutbol.com/arbitro/esp-primera-division-'
url_refs = list()
arbitros_df = pd.DataFrame(columns=['Temporada', 'Nombre', 'Partidos', 'Amarillas', 'Rojas'])
for año in range(2017, 2010, -1):
    temp = str(año) + '-' + str(año + 1)
    string = str(año) + '-' + str(año + 1) + '/1/'
    url_refs.append(refs_url + string)
# url_refs[1] = 'http://www.livefutbol.com/arbitro/esp-primera-division-2016-2017_2/1/'

for url_ref in url_refs:
    try:
        refs_page = BS(uOpen(url_ref).read(), 'html.parser')
        tabla = refs_page.find('table', {'class': 'standard_tabelle'})
        filas = tabla.findAll('tr')[1:-1]
        for fila in filas:
            # fila = filas[0]
            temporada = url_ref[-12:-3]
            datos = fila.findAll('td')
            nombre = datos[0].text
            partidos = int(datos[4].text)
            amarillas = int(datos[5].text)
            rojas = datos[6].text
            if rojas == '-':
                rojas = int(0)
            else:
                rojas = int(rojas)
# ctrl shift p to open command console
# set syntax = python
from urllib.request import urlopen as uOpen
from bs4 import BeautifulSoup as soup

# get the url you want to use
my_url = 'https://www.amazon.co.uk/s/ref=nb_sb_noss_1/262-5127199-8693620?url=search-alias%3Daps&field-keywords=apple+juice'

# grab the webpage using the urlopen function, create a file variable and put the contents there
uFile = uOpen(my_url)
# create a text variable and read the html page into it
html_page = uFile.read()
# close the connection
uFile.close()

# using the BeautifulSoup function, parse the html page and pass it into a variable;
# with the 'html.parser' argument, you tell the function how to parse the html page
parsed_page = soup(html_page, 'html.parser')

# test
# print(parsed_page.h1)
# print(parsed_page.p)

# now it's time to traverse the html and convert the desired items into a csv file
# use the findAll method to grab all the html elements you want and put them in a list
# syntax is: list = parsed_page.findAll('htmlelement', {'attributename': 'attributevalue'})
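# A minimal sketch of the step described above, following the findAll syntax in
# the comments. The tag and class names ('div', 's-result-item'), the h2 title
# lookup and the csv layout are illustrative assumptions; Amazon's real markup
# would need to be inspected in the browser before relying on them.
import csv

results = parsed_page.findAll('div', {'class': 's-result-item'})
with open('apple_juice.csv', 'w', newline='', encoding='utf-8') as out:
    writer = csv.writer(out)
    writer.writerow(['title'])
    for result in results:
        heading = result.find('h2')  # each result card is assumed to carry an h2 title
        if heading is not None:
            writer.writerow([heading.text.strip()])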
from urllib.request import urlopen as uOpen

if os.path.exists(root):
    path_to_data = os.path.join(root, 'Datos/Scrapped')
    path_to_save = os.path.join(root, 'Datos/Created')

# IMPORT HELPER FUNCTIONS
from Scrapping.utils import limpiar_nombre, buscar_equivalencia, url_to_image

# GLOBAL AND SEASONAL MODELS ######################################################

# Open connection, grab the web content and download it
# -----------------------------------------------------
m_url = 'http://www.marca.com/futbol/primera/equipos.html'
client = uOpen(m_url)
page = client.read()
client.close()

page_soup = BS(page, 'html.parser')
equipos = page_soup.findAll('li', {'id': 'nombreEquipo'})
print('Tenemos %d equipos' % len(equipos))  # "We have %d teams"

teams = list()
equipos_df = pd.DataFrame(columns=['Nombre', 'Escudo', 'Es_url'])
jugadores = list()
jugadores_df = pd.DataFrame(columns=['Equipo', 'Jugador', 'Dorsal'])

for equipo in equipos: