def mainPageScrape(f):
    address = "https://www.newegg.com/Processors-Desktops/SubCategory/ID-343"

    # opening up connection grabbing the page
    uClient = UReq(address)
    page_html = uClient.read()
    uClient.close()

    # html parsing
    page_soup = soup(page_html, "html.parser")

    # add each processor item container to a list of containers
    containers = page_soup.findAll("div", {"class": "item-container"})

    for container in containers:
        fields = containerScrape(container)
        csv_string = ",".join(fields[:7])
        if fields[1] in descriptionlog:
            print("Duplicate processor found. Not writing to list.")
        else:
            descriptionlog.append(fields[1])
            print(csv_string)
            f.write(csv_string + "\n")

    containers.clear()
Example #2
    def reporttohub(self, requestdata):

        if requestdata == "":
            webrequest = GenerateWebRequest(self.urlendpoint)
        else:
            webrequest = GenerateWebRequest(self.urlendpoint,
                                            data=requestdata.encode(
                                                'ascii', 'ignore'))

        tries = 0
        outcome = ""

        while tries < self.maximumtrieslimit:
            try:

                outcome = GetWebPage(webrequest)
                tries = 99999  # sentinel value: marks success and ends the retry loop

            except WebError as errorobject:
                tries = tries + 1
                print("Error accessing Hub: ", errorobject.reason)

        if tries == 99999:
            outcome = outcome.read()
            outcome = outcome.decode('utf-8', 'ignore')

        else:
            print("Gave up accessing Hub")

        return outcome
def parse_page(url):

    x = Ureq(url)
    page = x.read()
    x.close()
    page_parsed = Bsoup(page, 'html.parser')

    return page_parsed
def get_json(url):
    req = Request(url, headers=hdr)
    page = Ureq(req)
    try:
        js = page.read().decode()
        js = json.loads(js)
    except Exception:
        js = None
    return js
Example #5
def getMostBoughtData():
    my_url = 'https://finance.yahoo.com/u/yahoo-finance/watchlists/most-bought-by-hedge-funds/'

    # saves the information from the url into the client
    Client = Req(my_url)

    # saves the raw html of the page into a variable
    page_html = Client.read()
    Client.close()

    # parses the html of the website
    page_soup = soup(page_html, "html.parser")

    # finds all the parts of the webpage that would hold the titles
    symbols = page_soup.findAll("h2", {"class": "Fz(m)"})
    tickers = page_soup.findAll("a", {"class": "Fw(b)"})
    prices_html = page_soup.findAll(
        "td", {"class": "data-col2 Ta(end) Pstart(10px) Pend(6px) Fw(b)"})
    changes_html = page_soup.findAll(
        "td", {"class": "data-col4 Ta(end) Pstart(10px) Pend(6px)"})
    volumes_html = page_soup.findAll(
        "td", {"class": "data-col6 Ta(end) Pstart(10px) Pend(6px)"})
    avg_volumes_html = page_soup.findAll(
        "td", {"class": "data-col7 Ta(end) Pstart(10px) Pend(6px)"})

    sym = int(symbols[1].text[0:2])

    x = 0

    data = []

    # Skip the leading non-ticker links (long names, "Tech", "News") until the
    # first real ticker symbol is found
    while x < 48 - 1:
        if len(tickers[x].text) > 5 or tickers[x].text in ("Tech", "News"):
            x = x + 1
        else:
            break
    y = x
    # x - (y + sym) is negative here, so the price/volume cells are read from
    # the end of their lists
    while x < y + sym:
        ticker = tickers[x].text
        price = prices_html[x - (y + sym)].text

        change_str = changes_html[x - (y + sym)].text

        if change_str.startswith('+'):
            change = change_str[1:]
        else:
            change = change_str

        volume = volumes_html[x - (y + sym)].text
        avg_volume = avg_volumes_html[x - (y + sym)].text

        data.append((ticker, price, change, volume, avg_volume))

        x = x + 1

    return data
def json_from_url(url, params=None):
    try:
        from urllib.request import urlopen as Urlopen
        from urllib.parse import quote as Quote
    except ImportError:
        from urllib import pathname2url as Quote
        from urllib2 import urlopen as Urlopen
    if params:
        url += Quote(params)
    res = Urlopen(url)
    body = res.read().decode()
    return json.loads(body)
	def __init__(self, url):
		"""
		Class constructor
		"""
		try:
			response = UrlOpen(url)
			content = response.read()
			data = Load(content.decode("UTF-8"))
			self.bitcoin_data(data)

		except HTTPError as error:
			print("Error Code: {}".format(error.code))
def hockey_bet():
    # Pull in url for schedule
    # TODO: Check date, and if it is not during the season, exit function
    url = 'https://www.hockey-reference.com/leagues/NHL_2020_games.html'
    # Run through BeautifulSoup steps
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_ = 'left')
    game = [team.get_text() for team in game]
    drop_list = ['Date','Visitor','Home','Notes','']
    # Clean data
    game = [game for game in game if game not in drop_list]
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week = game[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Date','Visitor','Home']
    # Clean team names into readable format
    row_count = 0
    visitor = df['Visitor'].str.split(" ", expand = True) 
    home = df['Home'].str.split(" ", expand = True) 
    while row_count < len(df):
        if visitor[2][row_count] is None:
            df['Visitor'][row_count] = visitor[1][row_count]
        else:
            df['Visitor'][row_count] = visitor[2][row_count]
        if home[2][row_count] is None:
            df['Home'][row_count] = home[1][row_count]
        else:
            df['Home'][row_count] = home[2][row_count]
        row_count += 1
    # Only select todays games
    todays_date = datetime.now().strftime('%Y-%m-%d')
    todays_games = df[df['Date'] == todays_date]
    todays_games = todays_games.reset_index()
    todays_games = todays_games[['Visitor','Home']]
    return todays_games
def get_insider_trading_data():
    my_url = 'http://openinsider.com/screener?s=&o=&pl=&ph=&ll=&lh=&fd=730&fdr=&td=0&tdr=&fdlyl=&fdlyh=&daysago=&xp=1&vl=&vh=&ocl=&och=&sic1=-1&sicl=100&sich=9999&grp=0&nfl=&nfh=&nil=&nih=&nol=&noh=&v2l=&v2h=&oc2l=&oc2h=&sortcol=0&cnt=1000&page=1'

    Client = Req(my_url)

    page_html = Client.read()
    Client.close()

    page_soup = soup(page_html, "html.parser")

    # finds the links that hold the ticker symbols
    findsT = page_soup.findAll("a",{"onmouseout":"UnTip()"})

    # finds the title of the person that was insider trading
    findsTi = page_soup.findAll("td")

    # finds the right-aligned table cells that hold the numeric trade data
    findsNum = page_soup.findAll("td",{"align":"right"})

    x = 0   # index of the current ticker link
    y = 22  # offset of the cell holding the purchase price
    z = 23  # offset of the cell holding the quantity purchased
    w = 24  # offset of the cell holding the shares already owned
    a = 72  # offset of the cell holding the insider's title

    data = []
    
    while x < 1000:
        findT = findsT[x]
        findT = findT["href"].replace("/", "")
        findP = findsNum[y].text
        findQ = findsNum[z].text
        findO = findsNum[w].text
        findTi = findsTi[a].text
        data.append((findT, findTi, findP, findQ, findO))
        x += 1
        y += 12
        z += 12
        w += 12
        a += 17

    return data
def getCount(switch,phrase):
    print('Checking for phrase:'+str(phrase))
    Count = 0
    try: 
        headers = {}
        headers['User-Agent'] = useragent()
        if switch == 'err':
            return 0
        query = URLENCODE({'q': phrase})
        URL = SEARCH['Google'] % query
        REQ = REQUEST(URL,headers=headers)
        RESP = OPEN(REQ)
        RESULTS = RESP.read().decode("utf-8")
        JSON_RES = LOAD(RESULTS)
        #Suspected Terms of Service Abuse
        if JSON_RES['responseStatus'] == 403:
            if switch == 'False':
                print("Suspicion detected by Google - sleeping for 30 seconds")
                time.sleep(30)  # Sleep 30 seconds after blockage.
                return getCount('True',phrase)
            elif switch == 'True':
                print("Suspicion detected by Google - sleeping for 60 seconds")
                time.sleep(60)  # Sleep longer, at least for safe execution
                return getCount('err',phrase)
        elif JSON_RES['responseStatus'] == 200: #Everything is good
            DATA = JSON_RES['responseData']
            if len(DATA['results']) > 1: #Looks stupid but this is how it works
                count = DATA['cursor']['estimatedResultCount']
                Count = int(count)
            else :
                Count = 1
            time.sleep(3)
            return Count

    except Exception as ex:
        print(str(ex))
def baseball_bet():
    # Set the current date in a readable form and the form used for the html
    todays_date = datetime.now().strftime('%m-%d-%Y')
    date_html = datetime.now().strftime('%Y%m%d')
    # Set Opening Day date
    opening_day = "03-26-2020"
    # Parse Opening Day date
    OD = datetime.strptime(opening_day, "%m-%d-%Y")
    # Set current date
    present = datetime.now()
    # If it is before Opening Day, return from function
    if present.date() < OD.date():
        print('Opening Day is not until March 26. Please come back then.')
        return
    # Set url for todays date if season has already started
    url = 'https://www.espn.com/mlb/schedule/_/date/' + date_html
    # Make sure that there are actually games being played
    # If there are not, the url will not work
    try:
        uClient = Ureq(url)
        raw_content = uClient.read()
    except:
        print('There are no games being played on this day.')
        return
    # Run through BeautifulSoup steps to pull out desired data
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_='external')
    game_date_list = []
    # Fix dates given into readable datetime format
    for x in range(1, len(game)):
        game_date = game[x]['href'].split('/')[5].split('-')[-3:-1]
        game_date.append('2020')
        sent_str = ""
        for i in game_date:
            sent_str += str(i) + "-"
        sent_str = sent_str[:-1]
        date = datetime.strptime(sent_str, '%m-%d-%Y')
        date = date.strftime('%m-%d-%Y')
        game_date_list.append(date)
    # Get the names of the teams that are playing on that day
    game = html.findAll(class_='team-name')
    game = [team.get_text() for team in game]
    game_list = []
    for item in game:
        # The abbrvs are only the last three characters in the str
        item = item[-3:]
        game_list.append(item)
    # Split home and away teams from the list of cleaned teams
    bin_len = 2
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game_list) + 1):
        week = game_list[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Visitor', 'Home']
    df['Date'] = game_date_list
    todays_games = df[df['Date'] == todays_date]
    # Apply the lambda function that will clean the team names into more colloquial names
    todays_games['Home'] = todays_games['Home'].apply(lambda x: teams_dict[x])
    todays_games['Visitor'] = todays_games['Visitor'].apply(
        lambda x: teams_dict[x])
    # return data frame of games that are being played today
    return todays_games
Example #12
import bs4
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'

uclient = Req(my_url)
page_html = uclient.read()
uclient.close()

page_soup = soup(page_html, "html.parser")

containers = page_soup.findAll("div", {"class": "item-container"})

filename = "products_newegg.csv"
f = open(filename, "w")

headers = "Brand, Product_name\n"
f.write(headers)

for container in containers:
    brand = container.div.div.a.img["title"]

    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text

    print("Brand: " + brand)
    print("Product Name: " + product_name)

    f.write(brand + "," + product_name.replace(",", "|") + "\n")
Example #13
    def scrape():
        ####################################################################################
        concat = Sentry.get()
        #my_url = "file:///C:/Users/Adam-22-26/Desktop/graphics%20card%20-%20Newegg.com.html"
        my_url = 'https://www.newegg.com/global/ph-en/p/pl?d={}'.format(concat)
        my_url = my_url.replace(' ', '+')
        ####################################################################################
        uClient = Ureq(my_url)

        page_html = uClient.read()
        uClient.close()
        #html parsing
        page_soup = Soup(page_html, "html.parser")
        #grab each product container
        containers = page_soup.findAll("div", {"class": "item-container"})

        #manufacturer = page_soup.findAll("label",{"class": "form-checkbox"})
        #print(manufacturer )
        #print(len(containers))
        #print(containers[5:])
        #container = containers[5]
        #---------------------------------------- save the csv files
        fileName = "{}.csv".format(
            concat)  ###############################################

        f = open(fileName, "w")
        headers = "BRAND     , PRICES    ,  SAVES    , TITLES   , LINK    \n"  #
        f.write(headers)

        for container in containers[4:]:
            #---------------------------------------------------------
            brand_container = container.findAll("a", {"class": "item-brand"})
            brand = brand_container[0].img["title"]  #brand name

            #-------------------------------------------------------------------
            may_know = container.findAll("a", {"class": "item-title"})
            #print(may_know)

            ####################################################################
            title = container.a.img["title"]  #Name of selling
            #print(container)
            #######################################################3
            hyper = brand_container[0]["href"]
            #hyper = container.findAll("div",{"class": "item-info"})
            #hyper = hypers.a
            #print(hyper)
            #--------------------------------------------------------------
            price_container = container.findAll("li",
                                                {"class": "price-current"})
            price_container2 = price_container[0].strong
            price = re.findall(r'.\d.\d\d\d', str(price_container2))
            prices = ''.join(price)
            #------------------------------------------------------------------------
            save_container = container.findAll("span",
                                               {"class": "price-save-percent"})
            save = re.findall(r'\d\d.', str(save_container))
            saves = ''.join(save)

            if saves == '':
                saves = "None"
            if prices == "":
                prices = "Not Available"

            brandlistbox.insert(END, " :   " + brand)
            pricelistbox.insert(END, "₱ " + prices)
            savelistbox.insert(END, saves)
            Listbox4.insert(END, " :   " + title)
            hyperlink.insert(END, '  ' + hyper)
            #-------------------------------------------------------------------------

            f.write(
                brand.replace(',', '') + ", " + prices.replace(
                    ',', '.').replace('0', '1').replace('>', '    ') + ',' +
                saves.replace('', '').replace('None', '0%') + ', ' +
                title.replace(',', '') + ', ' + hyper + "\n")

        f.close()
        new_win = Button(window,
                         width=10,
                         text="New_Win",
                         command=mainwindow,
                         height=1,
                         font="Jokerman",
                         relief=RAISED,
                         activebackground="LightBlue1",
                         background='sky blue')
        new_win.place(x=105, y=90)
        messagebox.showinfo("Happens", "DONE! \n press ok to proceed")
Example #14
import re
from statistics import mean
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as soup

# Declare my_url variable

my_url = "https://www.carfax.com/Used-Honda-Civic-Type-R_t10063"

# Load my_url contents into Scrapee variable

Scrapee = Req(my_url)

# Extract html to variable Scrapee_html

Scrapee_html = Scrapee.read()

# Close web page

Scrapee.close()

# Parse html into node tree and strip html tags, store as variable Scrapee_soup

Scrapee_soup = soup(Scrapee_html, "html.parser")

#Find matching class data and store into three variables

Scrapee_soup_model = Scrapee_soup.findAll(
    "span", {"class": "srp-list-item-basic-info-model"})

Scrapee_soup_price = Scrapee_soup.findAll("span",
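
# The original example is cut off in the middle of the price lookup above, and
# the real class name for the price <span> is not shown. Hedged sketch
# (assumption): the statistics.mean import at the top suggests the goal is an
# average asking price, so a small illustrative helper is given below instead
# of guessing the missing selector.
def average_price(price_spans):
    # Strip "$" and "," from each span's text and average the numeric values.
    values = [float(s.text.replace("$", "").replace(",", "")) for s in price_spans]
    return mean(values)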
Example #15
 def retrieve_raw(self, request):
     request = self.auth.add_authorization_header(request)
     req = UrlLibOpen(request)
     res = json.loads(req.read().decode('utf-8'))
     return (req, res)
Example #16
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as beau

myurl = 'https://campinascomprelocal.com.br/tipo/bares/'
print(myurl)

# open connection page
uClient = UReq(myurl)
page_html = uClient.read()
uClient.close()

soup = beau(page_html, 'lxml')

contents = soup.title
print(contents)
Example #17
from urllib.request import urlopen as URL
import bs4
from bs4 import BeautifulSoup as BS

#Created a tuple for which need to scrape



GetURL = ("https://www.moneycontrol.com/india/stockpricequote/computers-software/infosys/IT",
"https://www.moneycontrol.com/india/stockpricequote/computers-software/tataconsultancyservices/TCS")
#Blank List to store values
printlist = []
for eachURL in GetURL:
    
    html = URL(eachURL)
    HTML=html.read()

    GetHTMLData = BS(HTML, 'lxml')

    title = GetHTMLData.title
#print(title)
    text = title.get_text()
    printlist.append(text)

    rows = GetHTMLData.find_all('span')

    for row in rows:
        Price_Item = row.get("id")

        if (Price_Item == "Bse_Prc_tick") or (Price_Item == "Nse_Prc_tick"):
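            # Hedged completion (assumption): the original example is cut off
            # here; a plausible ending records the live price text alongside
            # the page titles already collected in printlist.
            printlist.append(row.get_text())

# Illustrative wrap-up: show the collected titles and prices
print(printlist)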
                        
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
my_url = "https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38"
#opening url and grabbing page
uClient = Ureq(my_url)
page_html = uClient.read()
uClient.close()

#html parser
page_soup = soup(page_html, "html.parser")
#print(page_soup.h1) # prints H1
#print(page_soup.p)# prints paragraphs

#print(page_soup.body.div)

#grab each product

containers = page_soup.findAll("div", {"class": "item-container"})
print(len(containers))

# to open a file
file_name = "product.csv"
f = open(file_name, "w")
headers = "Brand, product name, Shipping\n"

f.write(headers)

#below 3 lines of code is for container 1 that is 0th
#container = containers[0]
#print(container.a)
#print(container.div.div.a.img["title"]) # will return the title
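
# Hedged sketch (assumption): the per-container loop below follows the same
# pattern as the other Newegg examples in this collection (brand from the
# nested img title, product name from the "item-title" link, shipping from the
# "price-ship" cell) and writes one CSV row per product.
for container in containers:
    brand = container.div.div.a.img["title"]
    product_name = container.findAll("a", {"class": "item-title"})[0].text
    shipping = container.findAll("li", {"class": "price-ship"})[0].text.strip()
    f.write(brand + "," + product_name.replace(",", "|") + "," + shipping + "\n")

f.close()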
Example #20
#! /usr/bin/env python3.6
from urllib.request import urlopen as Uop
from bs4 import BeautifulSoup as BS4Soup

html = Uop("http://pythonscraping.com/pages/page1.html")
bs_obj = BS4Soup(html.read(), "html.parser")

print("bs_obj.html.body.h1 = ", bs_obj.html.body.h1)
print("bs_obj.html.h1 = ", bs_obj.html.h1)
print("bs_obj.body.h1 = ", bs_obj.body.h1)
print("bs_obj.h1 = ", bs_obj.h1)
#print(bs_obj)
#print(html.read())

#bs_obj_local_html = BS4Soup("./simple_prac.html", "html.parser")
#print("bs_obj_local_html.h2 = ", bs_obj_local_html.h2)
Example #21
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as Soup
from selenium import webdriver

#The Website
driver = webdriver.Firefox()
driver.get(
    "https://dubai.dubizzle.com/en/property-for-sale/residential/?filters=(listed_by.value%3A%22LA%22)"
)
my_url = (
    "https://dubai.dubizzle.com/en/property-for-sale/residential/?filters=(listed_by.value%3A%22LA%22)"
)
# ----------------------------------------------------------------------------------------------------------------------
# Uclient downloads the Url which is stored in the variable my_url
Uclient = UReq(my_url)
# This reads my HTML which has been downloaded
Html = Uclient.read()
# Closes the HTML to prevent the console from crashing
Uclient.close()
# -----------------------------------------------------------------------------------------------------------------------
# Parses the HTML
Page_soup = Soup(Html, "html.parser")
# Grabs each product
# mobile = driver.find_elements_by_xpath('//span[@class="call-modal__phone_number"]')
modals = driver.find_elements_by_xpath('//*[@data-testid="lpv-call-button"]')
containers = Page_soup.findAll("div",
                               {"class": "ListItem__Root-sc-1i3osc0-1 hMPXKC"})

# Creating the file, the headers and the name of the file
filename = "properties.csv"
f = open(filename, "w")
headers = "Property name" + '|' + "Location" + '|' + "Price" + '|' + "Bedrooms" + '|' + "PhoneNumber" + "\n"
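
# Hedged sketch (assumption): the original example stops after building the
# header row. A minimal continuation could pair each parsed listing with the
# matching Selenium call-button element and write one pipe-separated row per
# property; the crude get_text() fallback below stands in for the site's real
# field-by-field markup, which is not shown above.
f.write(headers)
for container, modal in zip(containers, modals):
    listing_text = container.get_text("|", strip=True)  # pipe-joined listing text
    phone_number = modal.text  # text of the lpv-call-button element
    f.write(listing_text + "|" + phone_number + "\n")
f.close()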
def GetUrl(url):
    Uclient = Req(url)
    page_html = Uclient.read()
    Uclient.close()
    page_soup = Soup(page_html, "html.parser")
    return page_soup
Example #23
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'

# opening up connection, grabbing the page
Uclient = Ureq(my_url)

#it offloads the content into a variable
page_html = Uclient.read()

#close the connection
Uclient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each product
containers = page_soup.findAll("div", {"class": "item-container"})

for container in containers:
    brand = container.div.div.a.img["title"]
    
    title_container = container.findAll("a",{"class":"item-title"})
    product_name = title_container[0].text

    shipping_container = container.findAll("li",{"class":"price-ship"})
    shipping = shipping_container[0].text.strip()

    print("brand: " + brand)
    print("product_name: " + product_name)
Example #24
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# importing libraries
from datetime import datetime
from datetime import timedelta
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as soup
now = datetime.now()
todaysDay = now.day
todaysMonth = now.month

#Fetching altomfotball webpage
my_url = "http://www.altomfotball.no/element.do?cmd=tournament&tournamentId=1&useFullUrl=false"
uClient = UReq(my_url)
oversiktsside = uClient.read()
uClient.close()

#Tables for upcoming matches
datoliste = []
rundeliste = []
konkurranseliste = []
hjemmelagliste = []
stillingliste = []
bortelagliste = []
kanalliste = []

#HTML parsing
page_soup = soup(oversiktsside, "html.parser")

#fills the individual lists with information about the upcoming round
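
# Hedged sketch (assumption): the original example ends before the lists are
# filled. The loop below assumes a typical fixture table with one <tr> per
# match and one <td> per column, in the same order as the lists declared
# above; the real altomfotball markup may differ.
for row in page_soup.findAll("tr"):
    cells = row.findAll("td")
    if len(cells) < 7:
        continue
    datoliste.append(cells[0].text.strip())
    rundeliste.append(cells[1].text.strip())
    konkurranseliste.append(cells[2].text.strip())
    hjemmelagliste.append(cells[3].text.strip())
    stillingliste.append(cells[4].text.strip())
    bortelagliste.append(cells[5].text.strip())
    kanalliste.append(cells[6].text.strip())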
Example #25
def hockey_win(date):
    url = 'https://www.hockey-reference.com/leagues/NHL_2020_games.html'
    # Run through BeautifulSoup steps
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_ = 'left')
    results = html.findAll(class_ = 'right')
    game = [team.get_text() for team in game]
    results = [team.get_text() for team in results]
    results_drop = ['LOG']
    results = [results for results in results if results not in results_drop]
    drop_list = ['Date','Visitor','Home','Notes','']
    # Clean data
    game = [game for game in game if game not in drop_list]
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week = game[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Date','Visitor','Home']
    # Bin the goal, attendance, and time results into rows of four
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(results) + 1):
        week = results[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    # Clean team names into readable format
    row_count = 0
    visitor = df['Visitor'].str.split(" ", expand=True)
    home = df['Home'].str.split(" ", expand=True)
    while row_count < len(df):
        if visitor[2][row_count] is None:
            df['Visitor'][row_count] = visitor[1][row_count]
        else:
            df['Visitor'][row_count] = visitor[2][row_count]
        if home[2][row_count] is None:
            df['Home'][row_count] = home[1][row_count]
        else:
            df['Home'][row_count] = home[2][row_count]
        row_count += 1
    # Label the result columns and join them onto the schedule
    df_1.columns = ['Visitor_Goals','Home_Goals','Attendance','Time']
    total_df = pd.concat([df,df_1],axis=1,join='inner')
    win_count = 0
    win_list = []
    while win_count < len(total_df):
        visitor_goals = total_df['Visitor_Goals'][win_count]
        home_goals = total_df['Home_Goals'][win_count]
        if visitor_goals != '' and home_goals != '':
            # Compare goal totals numerically rather than as strings
            if int(visitor_goals) > int(home_goals):
                win_list.append(total_df['Visitor'][win_count])
            elif int(home_goals) > int(visitor_goals):
                win_list.append(total_df['Home'][win_count])
            else:
                win_list.append('Tie')
        else:
            win_list.append('Incomplete')
        win_count += 1
    total_df['Winner'] = win_list
    todays_games = total_df[total_df['Date'] == date]
    todays_games = todays_games.reset_index()
    return todays_games
Example #26
def basketball_bet():
    # Get the current month and day in order to get the games playing today
    current_month_text = datetime.now().strftime('%B').lower()
    current_day = datetime.now().strftime('%d')
    # Pull the url based on the current month
    try:
        url = 'https://www.basketball-reference.com/leagues/NBA_2020_games-' + current_month_text + '.html'
    except:
        print('There are currently no basketball games being played today')
        return
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    schedule_text = html.findAll(class_="left")
    # Get the text from the html
    schedule = [game.get_text() for game in schedule_text]
    # Fill dataframe with game date, visiting team name, and home team name
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(schedule) + 1):
        week = schedule[start:end]
        start = end + 1
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    df_1.columns = ['Date', 'Visitor', 'Home']
    # Clean all of the columns
    row_count = 0
    new = df_1['Date'].str.split(" ", n=3, expand=True)
    while row_count < len(df_1):
        df_1['Date'][row_count] = new[2][row_count][:-1]
        row_count += 1
    game_time = html.findAll(class_='right')
    game_time = [team.get_text() for team in game_time]
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game_time) + 1):
        week = game_time[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Game_Time (EST)', 'Stat1', 'Stat2', 'Stat3']
    df = df['Game_Time (EST)']
    # Concat the dataframes to get desired data
    todays_games = pd.concat([df_1, df], axis=1, join='inner')
    todays_games = todays_games[todays_games['Date'] == current_day]
    # If there are no games being played, exit function
    if len(todays_games) == 0:
        print('There are currently no basketball games being played today.')
        return
    # Clean team names into more readable forms
    todays_games = todays_games.reset_index()
    todays_games = todays_games[['Visitor', 'Home', 'Game_Time (EST)']]
    todays_games['Home'] = todays_games['Home'].apply(lambda x: teams_dict[x])
    todays_games['Visitor'] = todays_games['Visitor'].apply(
        lambda x: teams_dict[x])
    # Return games being played today
    return todays_games
def remainingPagesScrape(f):
    page = 2
    duplicateCount = 0
    link = 'https://www.newegg.com/Processors-Desktops/SubCategory/ID-343/Page-'

    while True:
        try:
            address = link + str(page)
            print()
            print("Preparing to Scrape Page: " + str(page))
            print("Address: " + address)
            print()

            # opening up connection grabbing the page
            uClient = UReq(address)
            page_html = uClient.read()
            uClient.close()

            # html parsing
            page_soup = soup(page_html, "html.parser")

            # add each processor item container to a list of containers
            containers = page_soup.findAll("div", {"class": "item-container"})

            for container in containers:
                fields = containerScrape(container)
                csv_string = ",".join(fields[:7])
                if fields[1] in descriptionlog:
                    print("Duplicate processor found. Not writing to list.")
                    duplicateCount = duplicateCount + 1
                else:
                    descriptionlog.append(fields[1])
                    print(csv_string)
                    f.write(csv_string + "\n")
            containers.clear()

            if duplicateCount > 100:
                print()
                print(
                    "Duplicate Count Is " + str(duplicateCount) +
                    ". This Suggests The Data Is Being Reiterated. The Script Will Stop."
                )
                print("Processor Scrape Complete")
                print()
                print("Traversed " + str(page) + " Pages")
                print(str(len(descriptionlog)) + " Unique Processors Found")
                print()
                print("Data Written To: " + f.name)
                f.close()
                break

            page = page + 1

        except IndexError as e:
            print()
            page = page + 1
            # f.close()
            print("So Far We Have Traversed " + str(page - 1) + " Pages")
            print(str(len(descriptionlog)) + " Unique Processors Found")
            print(str(duplicateCount) + " Duplicates Ignored")
Example #28
 def getPage(self):
     uClient = UReq(self.url)
     self.page_html = uClient.read()
     uClient.close()
Example #29
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as Req

url = "https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off"

Client = Req(url)

page_html = Client.read()

page_soup = soup(page_html, "html.parser")

# print(page_soup.contents)

containers = page_soup.findAll("div", {"class": "_3O0U0u"})

container = containers[0]

print(container.div.img['alt'])

price = container.findAll("div", {"class": "col col-5-12 _2o7WAb"})

print(price[0].text)

rating = container.findAll("div", {"class": "hGSR34"})

print(rating[0].text)

filename = "flpphn.csv"
f = open(filename, "w")

headers = "Products_Name,Pricing,Ratings\n"
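
# Hedged sketch (assumption): the example stops after defining the header row.
# A plausible continuation reuses the selectors demonstrated above for every
# container and writes one CSV row per phone.
f.write(headers)
for container in containers:
    name = container.div.img['alt']
    price = container.findAll("div", {"class": "col col-5-12 _2o7WAb"})[0].text
    rating_divs = container.findAll("div", {"class": "hGSR34"})
    rating = rating_divs[0].text if rating_divs else ""
    f.write(name.replace(",", "|") + "," + price + "," + rating + "\n")
f.close()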
Example #30
def fetchPage(params={}):
    get = params.get
    link = get("link")
    ret_obj = {}
    if get("post_data"):
        log("called for : " + repr(params['link']))
    else:
        log("called for : " + repr(params))

    if not link or int(get("error", "0")) > 2:
        log("giving up")
        ret_obj["status"] = 500
        return ret_obj

    if get("post_data"):
        if get("hide_post_data"):
            log("Posting data")
        else:
            log("Posting data: " + urlencode(get("post_data")))

        request = HTTPRequest(link, urlencode(get("post_data")))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
    else:
        log("Got request")
        request = HTTPRequest(link)

    if get("headers"):
        for head in get("headers"):
            request.add_header(head[0], head[1])

    request.add_header('User-Agent', USERAGENT)

    if get("cookie"):
        request.add_header('Cookie', get("cookie"))

    if get("refering"):
        request.add_header('Referer', get("refering"))

    try:
        log("connecting to server...")

        con = OpenRequest(request)
        ret_obj["header"] = con.info()
        ret_obj["new_url"] = con.geturl()
        if get("no-content", "false") == "false":
            inputdata = con.read()
            ret_obj["content"] = inputdata.decode("utf-8")

        con.close()

        log("Done")
        ret_obj["status"] = 200
        return ret_obj

    except HTTPError as e:
        err = str(e)
        log("HTTPError : " + err)
        log("HTTPError - Headers: " + str(e.headers) + " - Content: " +
            e.fp.read())

        params["error"] = str(int(get("error", "0")) + 1)
        ret = fetchPage(params)

        if "content" not in ret and e.fp:
            ret["content"] = e.fp.read()
            return ret

        ret_obj["status"] = 500
        return ret_obj
#Iterate through every page on https://pitchfork.com/reviews/albums/
	while(True):
		url = (base_url_main_page+"?page="+str(page_numbers))

		#iterate through until no page is found. Ignore other HTTP response errors
		try:
			response = Ureq(url)
		except urllib.error.HTTPError as e:
			error_message = e.read()
			if e.getcode() == 404:
				sys.exit("No page found")
			else:
				print(error_message)
		else:
			page_html = response.read()
			page_soup = soup(page_html, "html.parser")

			url_names = page_soup.findAll("div",class_= "review")

			count = 0
			#enter urls of album reviews
			for item in url_names:
				url_name = url_names[count].a["href"]

				album_url = (base_url_album_pages+url_name)

				#ignore HTTP response errors
				try: 
					album_response = Ureq(album_url)
				except urllib.error.HTTPError as ea:
def basketball_win(date):
    current_month = date[0:2]
    current_day = date[3:5]
    string = current_month
    current_month_text = datetime.strptime(string, "%m")
    current_month_text = datetime.strftime(current_month_text, "%B").lower()
    # Pull the url based on the current month
    try:
        url = 'https://www.basketball-reference.com/leagues/NBA_2020_games-' + current_month_text + '.html'
    except:
        print('There are currently no basketball games being played today')
        return
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    schedule_text = html.findAll(class_="left")
    # Get the text from the html
    schedule = [game.get_text() for game in schedule_text]
    # Fill dataframe with game date, visiting team name, and home team name
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(schedule) + 1):
        week = schedule[start:end]
        start = end + 1
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    df_1.columns = ['Date', 'Visitor', 'Home']
    # Clean all of the columns
    row_count = 0
    new = df_1['Date'].str.split(" ", n=3, expand=True)
    while row_count < len(df_1):
        df_1['Date'][row_count] = new[2][row_count][:-1]
        row_count += 1

    game_time = html.findAll(class_='right')
    game_time = [team.get_text() for team in game_time]
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game_time) + 1):
        week = game_time[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Game_Time', 'Visitor_Points', 'Home_Points', 'Stat3']
    df.drop(columns=['Stat3'], inplace=True)
    total_df = pd.concat([df_1, df], axis=1, join='inner')
    win_list = []
    row_count = 0
    for row in total_df['Date']:
        visitor_points = total_df['Visitor_Points'][row_count]
        home_points = total_df['Home_Points'][row_count]
        if visitor_points != '' and home_points != '':
            # Compare point totals numerically rather than as strings
            if int(visitor_points) > int(home_points):
                win_list.append(total_df['Visitor'][row_count])
            elif int(home_points) > int(visitor_points):
                win_list.append(total_df['Home'][row_count])
            else:
                win_list.append('Tie')
        else:
            win_list.append('Incomplete')
        row_count += 1
    total_df['Winner'] = win_list
    todays_games = total_df[total_df['Date'] == current_day]
    if len(todays_games) == 0:
        print('There are currently no basketball games being played today.')
    todays_games['Home'] = todays_games['Home'].apply(lambda x: teams_dict[x])
    todays_games['Visitor'] = todays_games['Visitor'].apply(
        lambda x: teams_dict[x])
    todays_games['Winner'] = todays_games['Winner'].apply(
        lambda x: teams_dict[x])
    return todays_games
def football_bet():
    # Ensure that the football season is currently going on by comparing real
    # dates instead of mismatched date strings
    today = datetime.now().date()
    season_end = datetime.strptime('February 2 2020', '%B %d %Y').date()
    season_start = datetime.strptime('September 10 2020', '%B %d %Y').date()
    if season_end < today < season_start:
        print(
            "The next football season hasn't begun yet. Please come back on September 10."
        )
        return
    elif today < season_end:
        url = 'https://www.pro-football-reference.com/years/2019/games.htm'
    else:
        url = 'https://www.pro-football-reference.com/years/2020/games.htm'
    # Run through BeautifulSoup steps to pull wanted data
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    teams_win_loss = html.findAll(class_='left')
    game = html.findAll(class_='right')
    game = [team.get_text() for team in game]
    teams_win_loss = [team.get_text() for team in teams_win_loss]
    removal = ['Day']
    teams_win_loss = [item for item in teams_win_loss if item not in removal]
    # Set todays date that will be used to select todays games
    date = datetime.now().strftime('%B %d')
    # Clean stats
    bin_len = 8
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week = game[start:end]
        start = end + 1
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    df_1.columns = [
        'Game_Week', 'Time (EST)', 'Stat1', 'Stat2', 'Stat3', 'Stat4', 'Stat5',
        'Stat6'
    ]

    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(teams_win_loss) + 1):
        week = teams_win_loss[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df_2 = pd.DataFrame(week_list)
    df_2.columns = ['Day_Of_Week', 'Date', 'Home', 'Visitor']
    # Concat data frames
    football = pd.concat(
        [df_1[['Game_Week', 'Time (EST)']], df_2[['Date', 'Home', 'Visitor']]],
        axis=1,
        join='inner')
    # Select only games being played today
    todays_games = football[football['Date'] == date]
    # Return dataframe
    return todays_games