def crear_calendario(temp, path):
    
    global partidos_df
    page_soup = BS(uOpen(path).read(), 'html.parser')
    jornadas = page_soup.find_all('div',{'class': 'jornada-calendario-historico'})
    
    for jornada in jornadas:

        numero_jornada_y_fecha = jornada.div.text  # round header: round number and date
        jor, fecha = filtrar_jornada(numero_jornada_y_fecha)
        partidos = jornada.findAll('td')  # one table cell per match
        
        for j, partido in enumerate(partidos):
            
            resultado = partido.text
            eq_loc, eq_vis, gol_loc, gol_vis = filtrar_resultado(resultado)  # split the cell text into teams and goals
            eq_loc, eq_vis = limpiar_nombre(eq_loc), limpiar_nombre(eq_vis)  # clean up the team names
            eq_loc, eq_vis = buscar_equivalencia(eq_loc), buscar_equivalencia(eq_vis)  # map to canonical team names
            
            res = pd.DataFrame([[temp, jor, j+1, fecha, eq_loc, eq_vis, gol_loc, gol_vis]], 
                               columns=list(partidos_df))
            partidos_df = partidos_df.append(res)
        
    partidos_df = partidos_df.reset_index(drop=True)
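
A minimal usage sketch for the function above, assuming pandas is imported as pd, the helper functions are in scope, and partidos_df is initialised globally; the column names and URL are illustrative, not taken from the original project:

# Hypothetical setup: column names and the calendar URL are placeholders.
partidos_df = pd.DataFrame(columns=['Temporada', 'Jornada', 'Partido', 'Fecha',
                                    'Local', 'Visitante', 'Goles_local', 'Goles_visitante'])
crear_calendario('2017-2018', 'https://example.com/calendario-liga-2017-18.html')
print(partidos_df.head())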
Example #2
def crear_calendario(temp, path):

    global partidos_df
    page_soup = BS(uOpen(path).read(), 'html.parser')
    rounds = page_soup.find_all('div',
                                {'class': 'jornada calendarioInternacional'})

    for r in rounds:

        rnd = r.caption.text  # Get the name of the round, e.g. "Jornada 1"
        matches = r.findAll('tr')  # Find all the matches in that round

        for j, match in enumerate(matches[1:]):

            loc = match.find('td', {'class': 'local'}).span.text
            away = match.find('td', {'class': 'visitante'}).span.text

            loc, away = limpiar_nombre(loc), limpiar_nombre(away)
            loc, away = buscar_equivalencia(loc), buscar_equivalencia(away)

            res = pd.DataFrame([[rnd, j + 1, loc, away]],
                               columns=list(partidos_df))
            partidos_df = partidos_df.append(res)

    partidos_df = partidos_df.reset_index(drop=True)
Example #3
def read_page(page_url):
    #Request page
    client = uOpen(page_url)
    page_html = client.read()
    client.close()

    return page_html
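
A short usage sketch, assuming soup is the BeautifulSoup alias used in the other examples; the URL is a placeholder:

# Hypothetical call; the returned bytes go straight into the parser.
html = read_page('https://example.com/')
page_soup = soup(html, 'html.parser')
print(page_soup.title)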
Example #4
def get_poke_soup(link):
    uClient = uReq(link, headers={'User-Agent': 'Magic Browser'})
    uCon = uOpen(uClient)
    poke_page_html = uCon.read()
    uCon.close()

    return soup(poke_page_html, 'html.parser')
Example #5
def url_to_image(url):
    '''
    Function to extract an image from a URL
    '''
    resp = uOpen(url)
    image = np.asarray(bytearray(resp.read()), dtype='uint8')
    image = cv2.imdecode(image, cv2.IMREAD_COLOR)
    return image
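
A hedged usage sketch: fetch an image and write the decoded array back to disk with OpenCV (the URL and filename are placeholders):

# Hypothetical URL; cv2.imwrite persists the decoded BGR array.
image = url_to_image('https://example.com/escudo.png')
print(image.shape)  # (height, width, 3)
cv2.imwrite('escudo_local.png', image)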
Example #6
    def news_titl(self):
        header = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
        news_url = 'https://economictimes.indiatimes.com/'
        uClient = uOpen(Request(
            news_url,
            headers=header))  # requesting the webpage from the internet
        newsPage = uClient.read()  # reading the webpage
        uClient.close()  # closing the connection to the web server
        news_html = bs(newsPage, "html.parser")  # parsing the webpage as HTML
        bigboxes = news_html.findAll(
            'ul', {"class": "newsList clearfix"
                   })  # searching for appropriate tag to get news titles
        box = bigboxes[0].contents  # taking the first iteration (for demo)
        del box[10:]  # deleting news more than 10 counts
        news = []  # initializing an empty list for news title

        for b in box:
            topicLink = "https://economictimes.indiatimes.com/" + b.a[
                'href']  # extracting the actual product link
            topicRes = uOpen(Request(
                topicLink,
                headers=header))  # getting the product page from server

            topic_html = bs(topicRes,
                            "html.parser")  # parsing the product page as HTML
            title_content = topic_html.findAll(
                'div', {"class": "topPart clearfix tac fixedOnLoad"
                        })  # searching for appropriate tag to get news titles
            body_content = topic_html.findAll(
                'div', {"class": "artSyn bgPink"
                        })  # searching for appropriate tag to get news article

            title = title_content[0].h1.text
            content = body_content[0].h2.text

            my_dict = {"Title": title, "Article": content}
            # fns = main_functions()
            # fns.store_raw_news(collection = collection, db_name = db_name, json = my_dict)
            news.append(my_dict)
        return news
Example #7
    def get_item_data(self, item_data):
        '''Get item data'''
        url = 'https://eu.api.battle.net/d3/data/item/{}?locale={}&apikey={}'.format(
            item_data, self.LOCALE, self.API_KEY)
        uClient = uOpen(url)
        output = uClient.read()
        uClient.close()
        parsed_output = ujson.loads(output)

        return parsed_output
Example #8
    def get_hero_profile(self, battleTag, heroID):
        '''Get hero profile'''
        battleTag = str(battleTag).replace('#', '%23')
        url = 'https://eu.api.battle.net/d3/profile/{}/hero/{}?{}&apikey={}'.format(
            battleTag, heroID, self.LOCALE, self.API_KEY)
        uClient = uOpen(url)
        output = uClient.read()
        uClient.close()
        parsed_output = ujson.loads(output)

        return parsed_output
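
Both wrappers follow the same open-read-parse pattern. A standalone, hedged version of that pattern is sketched below, outside the class, with the locale, API key and item slug left as placeholders:

import ujson
from urllib.request import urlopen as uOpen

LOCALE = 'en_GB'               # placeholder locale
API_KEY = 'your-api-key-here'  # placeholder key
url = 'https://eu.api.battle.net/d3/data/item/{}?locale={}&apikey={}'.format(
    'item-slug-goes-here', LOCALE, API_KEY)
uClient = uOpen(url)
parsed_output = ujson.loads(uClient.read())
uClient.close()
print(parsed_output)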
Example #9
def index():
    if request.method == 'GET':
        try:
            searchString = "news_titles"
            dbConn = pymongo.MongoClient("mongodb://localhost:27017/")  # opening a connection to Mongo
            db = dbConn['newscrawlerDB'] # connecting to the database called newscrawlerDB
            news = db[searchString].find({}) # searching the collection with the name same as the keyword
            if news.count() > 0:  # if there is a collection with searched keyword and it has records in it
                return render_template('results.html', news=news)  # show the results to user
            else:
                header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
                news_url = 'https://economictimes.indiatimes.com/'
                uClient = uOpen(Request(news_url, headers = header)) # requesting the webpage from the internet
                newsPage = uClient.read() # reading the webpage
                uClient.close() # closing the connection to the web server
                news_html = bs(newsPage, "html.parser") # parsing the webpage as HTML
                bigboxes = news_html.findAll('ul', {"class": "newsList clearfix"}) # searching for appropriate tag to get news titles
                box = bigboxes[0].contents  # taking the first iteration (for demo)
                del box[10:] # deleting news more than 10 counts
                news = [] # initializing an empty list for news title

                table = db[searchString] # creating a collection with the same name as search string. Tables and Collections are analogous.

                for b in box:
                    topicLink = "https://economictimes.indiatimes.com/" + b.a['href']  # extracting the actual product link
                    topicRes = uOpen(Request(topicLink, headers = header))  # getting the product page from server

                    topic_html = bs(topicRes, "html.parser")  # parsing the product page as HTML
                    title_content = topic_html.findAll('div', {"class": "topPart clearfix tac"})  # searching for appropriate tag to get news titles
                    body_content = topic_html.findAll('div', {"class": "artSyn bgPink"})  # searching for appropriate tag to get news titles

                    title = title_content[0].h1.text
                    content = body_content[0].h2.text

                    my_dict = {"Title": title, "Article": content}
                    x = table.insert_one(my_dict)  # inserting the dictionary containing the news title and article into the collection
                    news.append(my_dict)

            return render_template('results.html', news=news)
        except:
            return 'something is wrong'
Example #10
def getPokeImage(pokeURL):
    # getting the basename of the pokemon
    pokeBaseName = pokeURL.split('/')[-1] #Charizard is the pokeBaseName for Charizard and the two Mega forms Mega Charizard X and Mega Charizard Y
    # open the URL for that pokemon and read in the html
    pokemon = uOpen(pokeURL)
    pokePage = pokemon.read()
    pokeSoup = soup(pokePage, 'html.parser')
    # pictures are linked to in divs of class profile-images
    pokeProfile = pokeSoup.findAll('div',{'class':'profile-images'})[0]
    pokeImages = pokeProfile.findAll('img')
    # the 'alt' attribute stores the names of the pokemons
    pokeNames = [pokeImage['alt'] for pokeImage in pokeImages]
    # the 'src' attribute stores the links to the pictures of the pokemons
    pokePicLinks = [pokeImage['src'] for pokeImage in pokeImages]
    pokePicTuple = list(zip(pokePicLinks, pokeNames))
    # given the name and the link of a pokemon, uRetrieve can download its picture
    for pokePicLink,pokeName in pokePicTuple:
        # sometimes the 'alt' attribute does not have the basename of the pokemon, so it needs to be added to the name of the png file.
        if pokeBaseName not in pokeName:
            pokePic = uRetrieve(pokePicLink, pokeBaseName + ' ' + pokeName + '.png')
        else:
            pokePic = uRetrieve(pokePicLink, pokeName + '.png')
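
A minimal usage sketch, assuming uOpen, soup and uRetrieve are the urlopen, BeautifulSoup and urlretrieve aliases used elsewhere in these examples; the URL is illustrative:

from urllib.request import urlopen as uOpen, urlretrieve as uRetrieve
from bs4 import BeautifulSoup as soup

# Hypothetical URL; any page with div elements of class 'profile-images' fits the function.
getPokeImage('https://example.com/pokedex/charizard')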
Example #11
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uOpen

filename = "pc_games2.csv"
file = open(filename, "w")
file.close()
file = open(filename, "a+")
headers = "name,company,price,save\n"
file.write(headers)
for i in range(1, 11):
    myUrl = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=100007756&IsNodeId=1&Description=pc%20games&page={}&bop=And&PageSize=36&order=BESTMATCH'.format(
        i)
    Client = uOpen(myUrl)
    page = Client.read()
    Client.close()
    html_page = soup(page, "html.parser")
    containers = html_page.findAll("div", {"class": "item-container"})

    for container in containers:
        name = container.a.img["title"]
        b = (container.find("div", "item-action").ul.li.text).strip()
        old_price = b[0:6]
        c = (container.find("div", "item-action").ul).find(
            "li", "price-current").text.strip()
        new_price = c[0:6]
        #save=(container.find("div","item-action").ul).find("li","price-save").find("span","price-save-percent").text.strip()
        print("\n")
        print("name          " + name)
        try:
            company = container.find("div", "item-info").img["title"]
        except:
Example #12
from urllib.request import urlopen as uOpen, Request as uReq
from bs4 import BeautifulSoup as soup, NavigableString

uClient         = uReq('https://bulbapedia.bulbagarden.net/w/index.php?title=Category:Pok%C3%A9mon_that_are_part_of_a_three-stage_evolutionary_line&pagefrom=Raichu+%28Pok%C3%A9mon%29#mw-pages', headers={'User-Agent': 'Magic Browser'})
uCon            = uOpen(uClient, None, 5)
poke_page_html  = uCon.read()
uCon.close()

ps = soup(poke_page_html, 'html.parser')
ps = ps.find(attrs={'id': 'mw-content-text'}).find('div', attrs={'class': 'mw-category'})


with open('d.txt', 'a') as f:
    ps = ps.find_all('a')
    ps = [p.text.replace(' (Pokémon)', '\n') for p in ps]
    for p in ps:
        f.write(p)
Example #13
def connect(url):
    '''Connect to url and return html source as string'''
    uClient = uOpen(url)
    html_source = uClient.read()
    uClient.close()
    return html_source
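
A short usage sketch, parsing the returned source with the BeautifulSoup alias used in the other examples; the URL is a placeholder:

# Hypothetical call.
html_source = connect('https://example.com/')
page_soup = soup(html_source, 'html.parser')
print(len(page_soup.findAll('a')), 'links found')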
Example #14
from urllib.request import urlopen as uOpen
from bs4 import BeautifulSoup as soup
import re
import os

# associates the windows clear terminal command with a simpler name
clear = lambda: os.system('cls')

# this script scrapes the list of all steam products (including bundles, games,
# videos, music, and software)
# at 'https://store.steampowered.com/search/?sort_by=Name_ASC',
# and calculates the current mean user rating of all the products
# on the steam platform.

my_url = "https://store.steampowered.com/search/?sort_by=Name_ASC"

# opening connection and downloading the page
uClient = uOpen(my_url)
page_html = uClient.read()
uClient.close()

# instantiating html parser
page_soup = soup(page_html, "html.parser")

# gets all steam games on the page
games = page_soup.findAll("a", {"class": "search_result_row"})

# creates an empty array which is later populated with values for product review scores
reviewScores = []

# creates an integer value for the last catalogue page number
lastPageNum = 0
pageNumTags = page_soup.findAll(
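
The listing is cut off above. A minimal sketch of the mean-rating step that the opening comments describe, under the assumption that the scraper eventually fills reviewScores with numeric review percentages:

# Assumed continuation: average whatever scores were collected.
if reviewScores:
    mean_rating = sum(reviewScores) / len(reviewScores)
    print("Mean user rating across %d products: %.2f%%" % (len(reviewScores), mean_rating))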
Example #15
                        # print("vignvig")
                        StopUpdate = "y"
                        return [tempNINI, StopUpdate]
                    else:
                        tempNINI.update({str(EpiUID): [SerialUID, EpiImage]})
        except:
            pass

    return [tempNINI, StopUpdate]


# ************************************************
start_time = time.time()

MasterSerialList = uOpen(
    'https://raw.githubusercontent.com/pravanjam/TamilSerialz/master/Master_Serial_List.csv'
)
dataMst = StringIO(MasterSerialList.read().decode('ascii', 'ignore'))
dreader = csv.reader(dataMst)
Serial_Mst = []
SerialMeta = []

for row in dreader:
    Serial_Mst.append(row)

for SerialListID in range(1, len(Serial_Mst)):
    # print(Serial_Mst[SerialListID][1])
    SerialMeta.append({
        'SearchID': Serial_Mst[SerialListID][1],
        'bkURL': Serial_Mst[SerialListID][2],
        'Genre': Serial_Mst[SerialListID][3],
Example #16
def urlParser(my_url, parserType):
    urlHTML = uOpen(my_url)
    page_html = urlHTML.read()
    urlHTML.close()
    PParser = bSoup(page_html, parserType)
    return PParser
def request_web(url):
    request = uOpen(url)
    html_file = request.read()
    return html_file
    ju = jugador.replace(minuto, '')
    ju = limpiar_nombre(ju, [], stopwords=matches_stopwords)
    ju = ju.replace('()', '').rstrip()
    return mi, ju

temps = list(url_temporadas.keys())

for temp in temps: 
#temp = temps[0]
    url_jornadas = url_temporadas[temp]
    
    for jor, url in enumerate(url_jornadas):
        
        # url = url_jornadas[0]  # debug leftover; enumerate already supplies url for each round
        
        page = BS(uOpen(url).read(), 'html.parser')
        page.find('main')
        
        partidos = page.find('div', {'class': 'resultados borde-caja'})
        partidos = partidos.find('table')
        partidos = partidos.findAll('tr')    
        
        for n, partido in enumerate(partidos):
         
            eq_loc = partido.find('td', {'class':'equipo-local'}).text
            eq_vis = partido.find('td', {'class':'equipo-visitante'}).text
            
            eq_loc, eq_vis = limpiar_nombre(eq_loc), limpiar_nombre(eq_vis)        
            eq_loc, eq_vis = buscar_equivalencia(eq_loc), buscar_equivalencia(eq_vis)
            
            resultado = partido.find('td', {'class':'resultado'}).text
Example #19
from urllib.request import urlopen as uOpen
from bs4 import BeautifulSoup as soup

target = 'https://www.newegg.ca/Product/ProductList.aspx?Submit=ENE&IsNodeId=1&N=100007708%20600536049%20600536050%20600565061%20600565504%20600565674%20601107975%20601203793%20601204369%20601210955%20601205646%20601202919%20601203927%20601203901%20601294835%20601295933%20601194948%20601296707&cm_sp=Cat_video-Cards_1-_-Visnav-_-Gaming-Video-Cards_2'

#open connection, grab page, and then close connection
Client = uOpen(target)
html = Client.read()
Client.close()

#html parsing
soupy = soup(html, "html.parser")

#grabbing each product
containers = soupy.findAll("div", {"class": "item-container"})

print("Welcome to the NewEgg GPU WebScraper!\n")

for container in containers:
    manufacturer = container.div.div.a.img["title"]

    title_container = container.findAll("a", {"class": "item-title"})
    title = title_container[0].text

    shipping_container = container.findAll("li", {"class": "price-ship"})
    shipping = shipping_container[0].text.strip()

    print("Manufacturer: " + manufacturer)
    print("Title: " + title)
    print("Shipping: " + shipping + "\n")
    f.close()
except:
    print("Type in PC Part Picker URL you which to price track:")
    url = input()
    f = open(filename, "w")
    headers = "URL, Date_Time, PC_Price, ChangeInPrice\n"
    f.write(headers)

now = datetime.datetime.now()

my_url = url
hdr = {'User-Agent': 'Mozilla/5.0'}
#opening Client
try:
    req = uReq(my_url, headers=hdr)
    uClient = uOpen(req)
    page_html = uClient.read()
    uClient.close()
except:
    f.close()
    print("Could not Open URL... Try a again with different url")
    os.remove(filename)
    quit()

#parses the html page
page_soup = soup(page_html, "html.parser")
#Gets the current Price of the PC Build
Prices = page_soup.findAll("tr", {"class": "total-price part-list-totals"})
price = Prices[0]
buildprice = '"' + price.find("td", {"class": "tr nowrap"}).text + '"'
print("Current PC Build Costs: " + buildprice)
Example #21
    "Provide the full path where CSV reports shall be stored ... :  ")
CSVfile = "Flipkart_INTERACTIVE_%s.CSV" % DateStamp  #Assuming you run from scripts directory
#OutCSV = open(CSVpath + "/" + CSVfile, 'w', newline='')
OutCSV = open(CSVfile, 'w', encoding="utf-8", newline='')
OutWriter = csv.writer(OutCSV)

#print("SlNo.|itemName|rating|price|oldPrice|discount")
print("\n Ouput will be displayed in a moment ... \n")
#OutWriter.writerow("SlNo.|itemName|rating|price|oldPrice|discount")
OutWriter.writerow("IRPOD")
for pg in range(0, (int(pages))):
    URL = (baseURL + "&page=" + str(pg))
    #print("\n\n\n ############# \n Now URL is :    " + URL)
    #URL = 'https://www.flipkart.com/audio-video/pr?sid=0pm&marketplace=FLIPKART&offer=nb:mp:1154f86928,nb:mp:11cc851a28&hpid=u0KJH80uWRAYeEJJpMIZYap7_Hsxr70nj65vMAAFKlc=&fm=neo%2Fmerchandising&iid=M_62ce2069-ba72-4633-a9f3-272c137582ba_2.VLO9AZPF3DJW&ppt=clp&ppn=dotd-store&ssid=m03cg1ws6o0000001609272953413&otracker=clp_omu_infinite_Deals%2Bof%2Bthe%2BDay_2_2.dealCard.OMU_INFINITE_dotd-store_dotd-store_VLO9AZPF3DJW&cid=VLO9AZPF3DJW'
    #URL = 'https://www.flipkart.com/audio-video/pr?sid=0pm&marketplace=FLIPKART&offer=nb%3Amp%3A1154f86928%2Cnb%3Amp%3A11cc851a28&hpid=u0KJH80uWRAYeEJJpMIZYap7_Hsxr70nj65vMAAFKlc%3D&fm=neo%2Fmerchandising&iid=M_62ce2069-ba72-4633-a9f3-272c137582ba_2.VLO9AZPF3DJW&ppt=clp&ppn=dotd-store&ssid=m03cg1ws6o0000001609272953413&otracker=clp_omu_infinite_Deals%2Bof%2Bthe%2BDay_2_2.dealCard.OMU_INFINITE_dotd-store_dotd-store_VLO9AZPF3DJW&cid=VLO9AZPF3DJW&page=2'
    uReq = uOpen(URL)
    HtmlPage = uReq.read()
    uReq.close()

    PageSoup = soup(HtmlPage, "html.parser")
    containers = PageSoup.find_all("div", {"class": "_4ddWXP"})
    ratingsAll = PageSoup.find_all("div", {"class": "_3LWZlK"})
    reviewsAll = PageSoup.find_all("span", {"class": "_2_R_DZ"})
    pricesAll = PageSoup.find_all("div", {"class": "_30jeq3"})
    oldPricesAll = PageSoup.find_all("div", {"class": "_3I9_wc"})
    discountsAll = PageSoup.find_all("div", {"class": "_3Ay6Sb"})
    imageLinksAll = PageSoup.find_all("div", {"class": "_4ddWXP"})
    #containers = PageSoup.find_all("div")
    #print(containers)
    #print("###########\n")
    # print("Container Length     :" + str(len(containers)))
Example #22
filename = "playersFutbin.csv"
f = open(filename, "w")

headers = "player, rating, price\n"

f.write(headers)

i = 1
while i < 3:

	my_url = 'https://www.futbin.com/19/players?page=' + str(i)

	pgdownload = Request(my_url, headers={'User-Agent': 'Mozilla/5.0'})

	uClient = uOpen(pgdownload)
	page_html = uClient.read()
	uClient.close()

	page_soup = soup(page_html, "html.parser")

	containers = page_soup.findAll("tr", {"class": "player_tr_1"})
	containers2 = page_soup.findAll("tr", {"class": "player_tr_2"})
	containers.extend(containers2)

	for container in containers:

		player_name = container.find("a", {"class": "player_name_players_table"}).text

		if container.find("span", {"class": "form rating ut19 icon gold rare"}) is not None:
			rating = container.find("span", {"class": "form rating ut19 icon gold rare"}).text
		elif container.find("span", {"class": "form rating ut19 gold rare"}) is not None:
Example #23
__author__ = "Laurence Elliott"

from urllib.request import urlopen as uOpen
from bs4 import BeautifulSoup as soup
import re
import os

myUrl = "https://www.freewarefiles.com/search.php?categoryid=1&query=&boolean=exact"

# connecting to and downloading page
uClient = uOpen(myUrl)
page_html = uClient.read()
uClient.close()

# instantiating BeautifulSoup parsing of first page
page_soup = soup(page_html, "html.parser")

# gets page numbers from list above program listings
numPagesA = page_soup.findAll("li", {"class": "page-item"})
numPagesArr = []
for numPageA in numPagesA:
    numPage = numPageA.findAll("a", {"class": "page-link"})[0]
    try:
        numPage = re.search('(?<=>)[0-9]+(?=<\/a>)', str(numPage)).group(0)
        numPagesArr.append(numPage)
    except:
        pass

# the last of the list of page numbers is stored for reference as the last
# page of the search
maxPage = numPagesArr[-1]
Example #24
 def request_web(self):
     request = uOpen(self.url)
     html_file = request.read()
     return html_file
# --------------
refs_url = 'http://www.livefutbol.com/arbitro/esp-primera-division-'
url_refs = list()
arbitros_df = pd.DataFrame(columns=['Temporada', 'Nombre', 'Partidos', 'Amarillas', 'Rojas'])

for año in range(2017, 2010, -1):
    temp = str(año) + '-' + str(año+1)
    string = str(año) + '-' + str(año+1) + '/1/'
    url_refs.append(refs_url + string)

# url_refs[1] = 'http://www.livefutbol.com/arbitro/esp-primera-division-2016-2017_2/1/'

for url_ref in url_refs:
  
    try: 
        refs_page = BS(uOpen(url_ref).read(), 'html.parser')

        tabla = refs_page.find('table', {'class': 'standard_tabelle'})
        filas = tabla.findAll('tr')[1:-1]
        
        for fila in filas:
            
            #fila = filas[0]
            temporada = url_ref[-12:-3]
            datos = fila.findAll('td')
            nombre = datos[0].text    
            partidos = int(datos[4].text)
            amarillas = int(datos[5].text)
            rojas = datos[6].text
            if rojas == '-': rojas = int(0)
            else: rojas = int(rojas)
Example #26
#ctrl shift p to open command console
#set syntax = python

from urllib.request import urlopen as uOpen
from bs4 import BeautifulSoup as soup

#get the url you want to use
my_url = 'https://www.amazon.co.uk/s/ref=nb_sb_noss_1/262-5127199-8693620?url=search-alias%3Daps&field-keywords=apple+juice'

#grab the webpage using the urlopen function, create a file variable and put the contents there
uFile = uOpen(my_url)

#create a text variable and read the html page into it
html_page = uFile.read()

#close the connection
uFile.close()

#using the beautiful soup function, parse the html page and pass it into a variable
#with the 'html.parser' argument, you tell the function how to parse the html page
parsed_page = soup(html_page, 'html.parser')

#test
#print(parsed_page.h1)
#print(parsed_page.p)

#now it's time to traverse the html and convert desired items into a csv file

#use the findAll method to grab all the html elements you want and put them in a list

#syntax is list = parsed_page.findAll('htmlelement', {'attributename':'attributevalue'})
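
A hedged illustration of the findAll syntax described above; the tag and attribute values are assumptions for demonstration, not taken from the actual Amazon markup:

# Hypothetical selector values, shown only to demonstrate the findAll pattern.
results = parsed_page.findAll('div', {'class': 'result-item'})
print('Found %d matching elements' % len(results))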
Example #27
from urllib.request import urlopen as uOpen

if os.path.exists(root):
    path_to_data = os.path.join(root, 'Datos/Scrapped')
    path_to_save = os.path.join(root, 'Datos/Created')

# IMPORT HELPER FUNCTIONS
from Scrapping.utils import limpiar_nombre, buscar_equivalencia, url_to_image

# GLOBAL AND SEASONAL MODELS
######################################################

# Open connection, grab the web content and download it
# -----------------------------------------------------
m_url = 'http://www.marca.com/futbol/primera/equipos.html'
client = uOpen(m_url)
page = client.read()
client.close()

page_soup = BS(page, 'html.parser')
equipos = page_soup.findAll('li', {'id': 'nombreEquipo'})
print('Tenemos %d equipos' % len(equipos))

teams = list()
equipos_df = pd.DataFrame(columns=['Nombre', 'Escudo', 'Es_url'])

jugadores = list()
jugadores_df = pd.DataFrame(columns=['Equipo', 'Jugador', 'Dorsal'])

for equipo in equipos: