def football_bet():
    # Ensure that the football season is currently going on
    today = datetime.now()
    last_season_end = datetime.strptime('February 2 2020', '%B %d %Y')
    next_season_start = datetime.strptime('September 10 2020', '%B %d %Y')
    if last_season_end < today < next_season_start:
        print(
            "The next football season hasn't begun yet. Please come back on September 10."
        )
        return
    elif today < last_season_end:
        url = 'https://www.pro-football-reference.com/years/2019/games.htm'
    else:
        url = 'https://www.pro-football-reference.com/years/2020/games.htm'
    # Run through BeautifulSoup steps to pull wanted data
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    teams_win_loss = html.findAll(class_='left')
    game = html.findAll(class_='right')
    game = [team.get_text() for team in game]
    teams_win_loss = [team.get_text() for team in teams_win_loss]
    removal = ['Day']
    teams_win_loss = [item for item in teams_win_loss if item not in removal]
    # Set today's date, used to select today's games
    date = datetime.now().strftime('%B %d')
    # Group the stat cells into rows of eight (one row per game)
    bin_len = 8
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week = game[start:end]
        start = end + 1
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    df_1.columns = [
        'Game_Week', 'Time (EST)', 'Stat1', 'Stat2', 'Stat3', 'Stat4', 'Stat5',
        'Stat6'
    ]

    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(teams_win_loss) + 1):
        week = teams_win_loss[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df_2 = pd.DataFrame(week_list)
    df_2.columns = ['Day_Of_Week', 'Date', 'Home', 'Visitor']
    # Concat data frames
    football = pd.concat(
        [df_1[['Game_Week', 'Time (EST)']], df_2[['Date', 'Home', 'Visitor']]],
        axis=1,
        join='inner')
    # Select only games being played today
    todays_games = football[football['Date'] == date]
    # Return dataframe
    return todays_games
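Note: the excerpt above uses names that its imports do not show (Ureq, soup, pd, datetime). A minimal, assumed set of imports that matches those aliases, plus a hypothetical call, would be:

from datetime import datetime
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
import pandas as pd

# With those imports in place, the scraper can be called directly:
# todays_games = football_bet()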
Example #2
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as soup

f = open('data.csv', 'w')

url = 'https://www.dicksmith.co.nz/dn/shop/phones/iphone/?page=1'

Client = Req(url)
page_html = Client.read()
Client.close()

page_soup = soup(page_html, 'html.parser')

containers = page_soup.find_all('div', class_='_1umis')

f.write('Phone, Rating, Price \n')

for container in containers:
    Phone = container.find('a', itemprop='url').text
    if container.find('meta', itemprop='ratingValue') is None:
        Rating = 'no rating'
    else:
        Rating = container.find('meta', itemprop='ratingValue')['content']
    Price = container.find('span', itemprop='price')['content']
    f.write(Phone.replace(',', ' |') + ', ' + Rating + ', ' + Price + "\n")
    print(
        Phone.replace(',', ' |') + ', ' + Rating + ',' +
        Price.replace(',', '') + "\n")

f.close()
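As a side note, the manual comma escaping above can be avoided with the standard csv module (used in a later example on this page), since csv.writer quotes fields for you; a minimal sketch with made-up example values:

import csv

# Sketch only: csv.writer quotes any field containing a comma, so no replace() is needed
with open('data.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Phone', 'Rating', 'Price'])
    writer.writerow(['iPhone 11 Pro, 64GB', '4.5', '1899'])  # illustrative values only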
from bs4 import BeautifulSoup as soup
import requests
from urllib.request import urlopen as UReq

# Question 1: How many products appear when you search for "breaking benjamin"? (see the URL)

my_url = 'https://www.merchbar.com/search?q=breaking%20benjamin&p=1'

uClient = UReq(my_url)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, 'html.parser')
# print(page_soup.title)

# grabs each product
containers = page_soup.findAll("div", {"class": "col-md-4 col-6"})
print(len(containers))
# 20 is the answer
divWithInfo = containers[0].find("div", "MerchTile.module__brandName")
Example #4
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as Soup
from selenium import webdriver

# The website
my_url = (
    "https://dubai.dubizzle.com/en/property-for-sale/residential/?filters=(listed_by.value%3A%22LA%22)"
)
driver = webdriver.Firefox()
driver.get(my_url)
# ----------------------------------------------------------------------------------------------------------------------
# Uclient downloads the Url which is stored in the variable my_url
Uclient = UReq(my_url)
# Read the downloaded HTML
Html = Uclient.read()
# Close the connection once the HTML has been read
Uclient.close()
# -----------------------------------------------------------------------------------------------------------------------
# Parses the HTML
Page_soup = Soup(Html, "html.parser")
# Grabs each product
# mobile = driver.find_elements_by_xpath('//span[@class="call-modal__phone_number"]')
modals = driver.find_elements_by_xpath('//*[@data-testid="lpv-call-button"]')
containers = Page_soup.findAll("div",
                               {"class": "ListItem__Root-sc-1i3osc0-1 hMPXKC"})

# Creating the file, the headers and the name of the file
filename = "properties.csv"
f = open(filename, "w")


def GetUrl(url):
    Uclient = UReq(url)
    page_html = Uclient.read()
    Uclient.close()
    page_soup = Soup(page_html, "html.parser")
    return page_soup
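A quick usage sketch for the GetUrl helper above; the URL is only a placeholder and the call assumes the imports and definition shown earlier:

page_soup = GetUrl("https://example.com")  # placeholder URL
print(page_soup.title)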
Example #6
    def getPage(self):
        uClient = UReq(self.url)
        self.page_html = uClient.read()
        uClient.close()
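Since getPage is a method that reads self.url and stores self.page_html, it presumably sits inside a small scraper class that the excerpt omits. A minimal, assumed sketch of such a class (the class name is hypothetical):

from urllib.request import urlopen as UReq


class PageFetcher:  # hypothetical name; only self.url and self.page_html are implied by the excerpt
    def __init__(self, url):
        self.url = url
        self.page_html = None

    def getPage(self):
        uClient = UReq(self.url)
        self.page_html = uClient.read()
        uClient.close()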
Example #7
from urllib.request import urlopen as URL
import bs4
from bs4 import BeautifulSoup as BS

# Tuple of URLs to scrape
GetURL = ("https://www.moneycontrol.com/india/stockpricequote/computers-software/infosys/IT",
"https://www.moneycontrol.com/india/stockpricequote/computers-software/tataconsultancyservices/TCS")
#Blank List to store values
printlist = []
for eachURL in GetURL:
    html = URL(eachURL)
    HTML = html.read()

    GetHTMLData = BS(HTML, 'lxml')

    title = GetHTMLData.title
#print(title)
    text = title.get_text()
    printlist.append(text)

    rows = GetHTMLData.find_all('span')

    for row in rows:
        Price_Item = row.get("id")

        if Price_Item == "Bse_Prc_tick" or Price_Item == "Nse_Prc_tick":
            # Assumed completion: collect the matching live price text
            printlist.append(row.get_text())
Example #8
#!/usr/bin/env python3

# Import Python libraries: locale, regex, statistics, urllib.request and Beautiful Soup
import locale
import re
from statistics import mean
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as soup

# Declare my_url variable

my_url = "https://www.carfax.com/Used-Honda-Civic-Type-R_t10063"

# Load my_url contents into Scrapee variable

Scrapee = Req(my_url)

# Extract html to variable Scrapee_html

Scrapee_html = Scrapee.read()

# Close web page

Scrapee.close()

# Parse html into node tree and strip html tags, store as variable Scrapee_soup

Scrapee_soup = soup(Scrapee_html, "html.parser")

#Find matching class data and store into three variables
Example #9
def hockey_win(date):
    url = 'https://www.hockey-reference.com/leagues/NHL_2020_games.html'
    # Run through BeautifulSoup steps
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_='left')
    results = html.findAll(class_='right')
    game = [cell.get_text() for cell in game]
    results = [cell.get_text() for cell in results]
    # Clean data: drop header labels and empty cells
    results_drop = ['LOG']
    results = [item for item in results if item not in results_drop]
    drop_list = ['Date', 'Visitor', 'Home', 'Notes', '']
    game = [item for item in game if item not in drop_list]
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week = game[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Date','Visitor','Home']
    # Group the results (goals, attendance, time) into rows of four
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(results) + 1):
        week = results[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    row_count = 0
    # Clean team names into a readable format (keep only the team nickname)
    visitor = df['Visitor'].str.split(" ", expand=True)
    home = df['Home'].str.split(" ", expand=True)
    while row_count < len(df):
        if visitor[2][row_count] is None:
            df.loc[row_count, 'Visitor'] = visitor[1][row_count]
        else:
            df.loc[row_count, 'Visitor'] = visitor[2][row_count]
        if home[2][row_count] is None:
            df.loc[row_count, 'Home'] = home[1][row_count]
        else:
            df.loc[row_count, 'Home'] = home[2][row_count]
        row_count += 1
    df_1.columns = ['Visitor_Goals', 'Home_Goals', 'Attendance', 'Time']
    total_df = pd.concat([df, df_1], axis=1, join='inner')
    win_count = 0
    win_list = []
    while win_count < len(total_df):
        visitor_goals = total_df['Visitor_Goals'][win_count]
        home_goals = total_df['Home_Goals'][win_count]
        if visitor_goals == '' or home_goals == '':
            # The game has not been played yet
            win_list.append('Incomplete')
        elif int(visitor_goals) > int(home_goals):
            win_list.append(total_df['Visitor'][win_count])
        elif int(home_goals) > int(visitor_goals):
            win_list.append(total_df['Home'][win_count])
        else:
            win_list.append('Tie')
        win_count += 1
    total_df['Winner'] = win_list
    # Only select today's games
    todays_games = total_df[total_df['Date'] == date]
    todays_games = todays_games.reset_index()
    return todays_games
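A usage sketch for hockey_win: the date argument has to match the text of the schedule table's Date column, which on hockey-reference.com appears to use the YYYY-MM-DD format (an assumption worth checking against the page). The call also assumes the same unseen imports as the other helpers (urlopen as Ureq, BeautifulSoup as soup, pandas as pd).

# Hypothetical call; the date is an example value, not taken from the original
todays_results = hockey_win('2020-02-15')
print(todays_results[['Visitor', 'Home', 'Winner']])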
Example #10
#! /usr/bin/env python3.6
from urllib.request import urlopen as Uop
from bs4 import BeautifulSoup as BS4Soup

html = Uop("http://pythonscraping.com/pages/page1.html")
bs_obj = BS4Soup(html.read(), "html.parser")

print("bs_obj.html.body.h1 = ", bs_obj.html.body.h1)
print("bs_obj.html.h1 = ", bs_obj.html.h1)
print("bs_obj.body.h1 = ", bs_obj.body.h1)
print("bs_obj.h1 = ", bs_obj.h1)
#print(bs_obj)
#print(html.read())

#bs_obj_local_html = BS4Soup("./simple_prac.html", "html.parser")
#print("bs_obj_local_html.h2 = ", bs_obj_local_html.h2)
def baseball_bet():
    # Set the current date in a readable form and the form used for the html
    todays_date = datetime.now().strftime('%m-%d-%Y')
    date_html = datetime.now().strftime('%Y%m%d')
    # Set Opening Day date
    opening_day = "03-26-2020"
    # Parse OD date
    OD = datetime.strptime(opening_day, "%m-%d-%Y")
    # Set current date
    present = datetime.now()
    # If it is before OD, return from function
    if present.date() < OD.date():
        print('Opening Day is not until March 26. Please come back then.')
        return
    # Set the url for today's date if the season has already started
    url = 'https://www.espn.com/mlb/schedule/_/date/' + date_html
    # Make sure that there are actually games being played
    # If there are not, the url will not work
    try:
        uClient = Ureq(url)
        raw_content = uClient.read()
    except Exception:
        print('There are no games being played on this day.')
        return
    # Run through BeautifulSoup steps to pull out desired data
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_='external')
    game_date_list = []
    # Fix dates given into readable datetime format
    for x in range(1, len(game)):
        game_date = game[x]['href'].split('/')[5].split('-')[-3:-1]
        game_date.append('2020')
        sent_str = ""
        for i in game_date:
            sent_str += str(i) + "-"
        sent_str = sent_str[:-1]
        date = datetime.strptime(sent_str, '%m-%d-%Y')
        date = date.strftime('%m-%d-%Y')
        game_date_list.append(date)
    # Get the names of the teams that are playing on that day
    game = html.findAll(class_='team-name')
    game = [team.get_text() for team in game]
    game_list = []
    for item in game:
        # The abbreviations are only the last three characters of the string
        item = item[-3:]
        game_list.append(item)
    # Split home and away teams from the list of cleaned teams
    bin_len = 2
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game_list) + 1):
        week = game_list[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Visitor', 'Home']
    df['Date'] = game_date_list
    todays_games = df[df['Date'] == todays_date].copy()
    # Apply the lambda function that will clean the team names into more colloquial names
    todays_games['Home'] = todays_games['Home'].apply(lambda x: teams_dict[x])
    todays_games['Visitor'] = todays_games['Visitor'].apply(
        lambda x: teams_dict[x])
    # return data frame of games that are being played today
    return todays_games
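baseball_bet also depends on a teams_dict lookup, mapping ESPN's three-letter abbreviations to shorter colloquial team names, that the excerpt never defines. The entries below are purely illustrative assumptions about its shape:

# Hypothetical excerpt of the abbreviation-to-name mapping used by baseball_bet
teams_dict = {
    'NYY': 'Yankees',
    'BOS': 'Red Sox',
    'LAD': 'Dodgers',
    # ... one entry per MLB team
}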
(1) Collect all the customer reviews of the product [2019 Dell laptop](https://www.amazon.com/Dell-Inspiron-5000-5570-Laptop/dp/B07N49F51N/ref=sr_1_11?crid=1IJ7UWF2F4GHH&keywords=dell%2Bxps%2B15&qid=1580173569& sprefix=dell%2Caps%2C181&sr=8-11&th=1) on Amazon.

(2) Collect the top 100 User Reviews of the film [Joker](https://www.imdb.com/title/tt7286456/reviews?ref_=tt_urv) from IMDB.

(3) Collect the abstracts of the top 100 research papers by using the query [natural language processing](https://citeseerx.ist.psu.edu/search?q=natural+language+processing&submit.x=0&submit.y=0&sort=rlv&t=doc) from CiteSeerX.

(4) Collect the top 100 tweets by using hashtag ["#wuhancoronovirus"](https://twitter.com/hashtag/wuhancoronovirus) from Twitter.
"""

from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as soup
import csv
import pandas as pd

my_url = 'https://www.imdb.com/title/tt7286456/reviews?ref_=tt_urv'
uClient = UReq(my_url)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, "html.parser")

with open('joker.csv', 'w', newline='') as file:
    file_input = csv.writer(file)
    file_input.writerow(["Number", "User Name", "Description"])
    names = page_soup.findAll("span", {"class": "display-name-link"})
    reviews = page_soup.findAll("div", {"class": "review-container"})
    # Only the reviews present on the first page are available without pagination
    for x in range(min(100, len(reviews))):
        name_id = names[x].text
        description = reviews[x].text.strip()
        file_input.writerow([x + 1, name_id, description])
Example #13
    def scrape():
        ####################################################################################
        concat = Sentry.get()
        #my_url = "file:///C:/Users/Adam-22-26/Desktop/graphics%20card%20-%20Newegg.com.html"
        my_url = 'https://www.newegg.com/global/ph-en/p/pl?d={}'.format(concat)
        my_url = my_url.replace(' ', '+')
        ####################################################################################
        uClient = Ureq(my_url)

        page_html = uClient.read()
        uClient.close()
        # Parse the HTML
        page_soup = Soup(page_html, "html.parser")
        # Grab each item container
        containers = page_soup.findAll("div", {"class": "item-container"})

        #manufacturer = page_soup.findAll("label",{"class": "form-checkbox"})
        #print(manufacturer )
        #print(len(containers))
        #print(containers[5:])
        #container = containers[5]
        #---------------------------------------- save the csv files
        fileName = "Online_Sales.csv"
        f = open(fileName, "w")
        headers = "BRAND     , PRICES    ,  SAVES    , TITLES   , LINK    \n"  #
        f.write(headers)

        for container in containers[4:25]:
            #---------------------------------------------------------
            brand_container = container.findAll("a", {"class": "item-brand"})
            brand = brand_container[0].img["title"]  #brand name

            #-------------------------------------------------------------------
            may_know = container.findAll("a", {"class": "item-title"})
            #print(may_know)

            ####################################################################
            title = container.a.img["title"]  #Name of selling
            #print(container)
            #######################################################3
            hyper = brand_container[0]["href"]
            #hyper = container.findAll("div",{"class": "item-info"})
            #hyper = hypers.a
            #print(hyper)
            #--------------------------------------------------------------
            price_container = container.findAll("li",
                                                {"class": "price-current"})
            price_container2 = price_container[0].strong
            price = re.findall(r'.\d.\d\d\d', str(price_container2))
            prices = ''.join(price)
            #------------------------------------------------------------------------
            save_container = container.findAll("span",
                                               {"class": "price-save-percent"})
            save = re.findall(r'\d\d.', str(save_container))
            saves = ''.join(save)
            if saves == '':
                saves = "0%"
            if prices == "":
                prices = "Not Available"
            #-------------------------------------------------------------------------

            f.write(
                brand.replace(',', '') + ", " +
                prices.replace(',', '.').replace('>', ' ') + ',' +
                saves + ', ' + title.replace(',', '') + ', ' + hyper + "\n")

        f.close()
        new_win = Button(window,
                         width=10,
                         text="New_Win",
                         command=mainwindow,
                         height=1,
                         font="Jokerman",
                         relief=RAISED,
                         activebackground="LightBlue1",
                         background='sky blue')
        new_win.place(x=105, y=90)
        messagebox.showinfo("Happens", "DONE! \n press ok to proceed")
from urllib.request import urlopen as UR
from bs4 import BeautifulSoup as soup
import re

URL_SIMP = 'https://www.newegg.com/Xbox-One-Systems/SubCategory/ID-3216'
URL_CLIENT = UR(URL_SIMP)
PAGE = URL_CLIENT.read()
URL_CLIENT.close()
PAGE_SOUP = soup(PAGE, "html.parser")
PAGE_CONTENT = PAGE_SOUP.findAll("div", {"class": "item-container"})

filename = "newegg.csv"
f = open(filename, "w")
headers = ("PRICE, SHIP, NAME\n")
f.write(headers)

for CONTENT in PAGE_CONTENT:

    PRICE_DATA = CONTENT.findAll("li", {"class": "price-current"})
    PRICE = PRICE_DATA[0].text
    PRICE_SIMP = re.sub(r"[^\d.]", "", PRICE)

    SHIP_CONTENT = CONTENT.findAll("li", {"class": "price-ship"})
    SHIP = SHIP_CONTENT[0].text.strip()

    NAME = CONTENT.img["title"]

    print("Price: " + "$" + PRICE_SIMP)
    print("Shipping Cost: " + SHIP)
    print("Name of Product: " + NAME)
    print("\n")
Example #15
import pandas as pd
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as Soup

my_url = 'https://batdongsan.com.vn/nha-dat-can-thue'

# Grabbing the page
Uclient = Req(my_url)
page_html = Uclient.read()
Uclient.close()

# Parse the downloaded HTML
page_soup = Soup(page_html, "html.parser")

#selecting
containers = page_soup.findAll("div", {"class": "branch"})

text = ""
text1 = ""
final = []
final1 = []
final3 = []
for i in range(len(containers)):
    store = containers[i].findAll("div", {"class": "branch-name"})
    text = store[0].text
    print("Title : " + text)
    store1 = containers[i].findAll("div", {"class": "branch-add"})
    text1 = store1[0].text
    print("Info : " + text1)
    final.append(text)
    final1.append(text1)
    Run the script and connect to "@kitrak_bot" in the Telegram app.
    Send a message like "Google <something you want to search>",
        e.g. "Google Candybar".
    The webpage will be opened on your laptop.
    Send "Close" and the Chrome browser will be killed.
"""
from urllib.request import urlopen as URL
import urllib.error as err
import webbrowser
import os
my_api = "647242931:AAG7wgAy4Fn-IWrgKn0RskeSnrowZcv6AOc"
temp = ""
while True:
    open_URL = 'https://api.telegram.org/bot' + my_api + '/getupdates'
    try:
        client_URL = URL(open_URL)
        data_URL = client_URL.read().decode('utf-8')
        client_URL.close()
        getstring = data_URL.split("{")
        getdata = getstring[-1].split("\"")
        data = getdata[-2]
        if (data == temp):
            pass
        else:
            print("data received:", data)
            temp = data
            if (data[0:6] == "google" or data[0:6] == "Google"):
                url = "http://www.google.co.in/search?q=" + data[6:]
                webbrowser.open(url)
            elif (data[0:5] == "close" or data[0:5] == "Close"):
                print("Closed")