def mainPageScrape(f):
    address = "https://www.newegg.com/Processors-Desktops/SubCategory/ID-343"

    # open the connection and grab the page
    uClient = UReq(address)
    page_html = uClient.read()
    uClient.close()

    # html parsing
    page_soup = soup(page_html, "html.parser")

    # add each processor item container to a list of containers
    containers = page_soup.findAll("div", {"class": "item-container"})

    for container in containers:
        fields = containerScrape(container)
        # join the seven fields returned by containerScrape into one CSV row
        csv_string = ",".join(fields[:7])
        if fields[1] in descriptionlog:
            print("Duplicate processor found. Not writing to list.")
        else:
            descriptionlog.append(fields[1])
            print(csv_string)
            f.write(csv_string + "\n")

    containers.clear()
def parse_page(url):
    # open the connection, read the raw HTML, and return it parsed
    x = Ureq(url)
    page = x.read()
    x.close()
    page_parsed = Bsoup(page, 'html.parser')

    return page_parsed
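# A minimal usage sketch (not in the original excerpt): fetch a category page
# with parse_page and list the product titles, assuming the same Newegg
# item-container markup used by the other examples on this page.
listing = parse_page("https://www.newegg.com/Processors-Desktops/SubCategory/ID-343")
for item in listing.findAll("div", {"class": "item-container"}):
    print(item.a.img["title"])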
Example #3
def getMostBoughtData():
    my_url = 'https://finance.yahoo.com/u/yahoo-finance/watchlists/most-bought-by-hedge-funds/'

    # opens the connection to the url
    Client = Req(my_url)

    # reads the raw HTML of the page
    page_html = Client.read()
    Client.close()

    # parses the html of the website
    page_soup = soup(page_html, "html.parser")

    # finds all the parts of the webpage that would hold the titles
    symbols = page_soup.findAll("h2", {"class": "Fz(m)"})
    tickers = page_soup.findAll("a", {"class": "Fw(b)"})
    prices_html = page_soup.findAll(
        "td", {"class": "data-col2 Ta(end) Pstart(10px) Pend(6px) Fw(b)"})
    changes_html = page_soup.findAll(
        "td", {"class": "data-col4 Ta(end) Pstart(10px) Pend(6px)"})
    volumes_html = page_soup.findAll(
        "td", {"class": "data-col6 Ta(end) Pstart(10px) Pend(6px)"})
    avg_volumes_html = page_soup.findAll(
        "td", {"class": "data-col7 Ta(end) Pstart(10px) Pend(6px)"})

    # number of symbols in the watchlist, parsed from the leading digits of the heading
    sym = int(symbols[1].text[0:2])

    x = 0

    data = []

    # skip anchor tags that are not tickers (overly long text or nav links such as "Tech"/"News")
    while x < 48 - 1:
        if len(tickers[x].text) > 5 or tickers[x].text in ("Tech", "News"):
            x = x + 1
        else:
            break
    y = x

    # collect one row per symbol; the price/change/volume cells are indexed from
    # the end of their lists, so x - (y + sym) runs from -sym up to -1
    while x < y + sym:
        ticker = tickers[x].text
        price = prices_html[x - (y + sym)].text

        change_str = changes_html[x - (y + sym)].text

        if change_str.startswith('+'):
            change = change_str[1:]
        else:
            change = change_str

        volume = volumes_html[x - (y + sym)].text
        avg_volume = avg_volumes_html[x - (y + sym)].text

        data.append((ticker, price, change, volume, avg_volume))

        x = x + 1

    return data
def get_insider_trading_data():
    my_url = 'http://openinsider.com/screener?s=&o=&pl=&ph=&ll=&lh=&fd=730&fdr=&td=0&tdr=&fdlyl=&fdlyh=&daysago=&xp=1&vl=&vh=&ocl=&och=&sic1=-1&sicl=100&sich=9999&grp=0&nfl=&nfh=&nil=&nih=&nol=&noh=&v2l=&v2h=&oc2l=&oc2h=&sortcol=0&cnt=1000&page=1'

    Client = Req(my_url)

    page_html = Client.read()
    Client.close()

    page_soup = soup(page_html, "html.parser")

    # anchors whose onmouseout handler marks them as ticker links
    findsT = page_soup.findAll("a", {"onmouseout": "UnTip()"})

    # every <td> cell; the insider's title is pulled from these by index
    findsTi = page_soup.findAll("td")

    # right-aligned cells hold the numeric columns (price, quantity, shares owned)
    findsNum = page_soup.findAll("td", {"align": "right"})

    x = 0   # index into the ticker links
    y = 22  # offset of the trade price within the numeric cells
    z = 23  # offset of the quantity of stock purchased
    w = 24  # offset of the shares already owned
    a = 72  # offset of the insider's title within all <td> cells

    data = []
    
    while x < 1000:
        findT = findsT[x]
        findT = findT["href"].replace("/", "")
        findP = findsNum[y].text
        findQ = findsNum[z].text
        findO = findsNum[w].text
        findTi = findsTi[a].text
        data.append((findT, findTi, findP, findQ, findO))
        x += 1
        # step the offsets to the next screener row (12 numeric cells and
        # 17 <td> cells per row)
        y += 12
        z += 12
        w += 12
        a += 17

    return data
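# A minimal driver sketch (not part of the original script): print the rows
# collected by the two functions above, assuming the Req and soup aliases they
# rely on are imported at the top of the file.
if __name__ == "__main__":
    for row in getMostBoughtData():
        print(row)
    for row in get_insider_trading_data():
        print(row)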
Example #5
    def getPage(self):
        uClient = UReq(self.url)
        self.page_html = uClient.read()
        uClient.close()
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
my_url = "https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38"
# open the URL and grab the page
uClient = Ureq(my_url)
page_html = uClient.read()
uClient.close()

#html parser
page_soup = soup(page_html, "html.parser")
#print(page_soup.h1) # prints H1
#print(page_soup.p)# prints paragraphs

#print(page_soup.body.div)

#grab each product

containers = page_soup.findAll("div", {"class": "item-container"})
print(len(containers))

# to open a file
file_name = "product.csv"
f = open(file_name, "w")
headers = "Brand, Product Name, Shipping\n"

f.write(headers)

# the three commented-out lines below inspect the first container (index 0)
#container = containers[0]
#print(container.a)
#print(container.div.div.a.img["title"]) # will return the title
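# A hedged completion of this excerpt (not in the original): loop over the
# containers, pull brand, name and shipping the same way the later Newegg
# examples do, write one CSV row per product, then close the file.
for container in containers:
    brand = container.div.div.a.img["title"]
    product_name = container.findAll("a", {"class": "item-title"})[0].text
    shipping = container.findAll("li", {"class": "price-ship"})[0].text.strip()
    f.write(brand + "," + product_name.replace(",", "|") + "," + shipping + "\n")

f.close()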
Example #7
import bs4
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'

uclient = Req(my_url)
page_html = uclient.read()
uclient.close()

page_soup = soup(page_html, "html.parser")

containers = page_soup.findAll("div", {"class": "item-container"})

filename = "products_newegg.csv"
f = open(filename, "w")

headers = "Brand, Product_name\n"
f.write(headers)

for container in containers:
    brand = container.div.div.a.img["title"]

    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text

    print("Brand: " + brand)
    print("Product Name: " + product_name)

    f.write(brand + "," + product_name.replace(",", "|") + "\n")

f.close()
def remainingPagesScrape(f):
    page = 2
    duplicateCount = 0
    link = 'https://www.newegg.com/Processors-Desktops/SubCategory/ID-343/Page-'

    while True:
        try:
            address = link + str(page)
            print()
            print("Preparing to Scrape Page: " + str(page))
            print("Address: " + address)
            print()

            # open the connection and grab the page
            uClient = UReq(address)
            page_html = uClient.read()
            uClient.close()

            # html parsing
            page_soup = soup(page_html, "html.parser")

            # add each processor item container to a list of containers
            containers = page_soup.findAll("div", {"class": "item-container"})

            for container in containers:
                fields = containerScrape(container)
                csv_string = ",".join(fields[:7])
                if fields[1] in descriptionlog:
                    print("Duplicate processor found. Not writing to list.")
                    duplicateCount = duplicateCount + 1
                else:
                    descriptionlog.append(fields[1])
                    print(csv_string)
                    f.write(csv_string + "\n")
            containers.clear()

            if duplicateCount > 100:
                print()
                print(
                    "Duplicate Count Is " + str(duplicateCount) +
                    ". This Suggests The Data Is Being Reiterated. The Script Will Stop."
                )
                print("Processor Scrape Complete")
                print()
                print("Traversed " + str(page) + " Pages")
                print(
                    str(len(descriptionlog)) + " Unique Processors Found")
                print()
                print("Data Written To: " + f.name)
                f.close()
                break

            page = page + 1

        except IndexError as e:
            print()
            page = page + 1
            # f.close()
            print("So Far We Have Traversed " + str(page - 1) + " Pages")
            print(str(len(descriptionlog)) + " Unique Processors Found")
            print(str(duplicateCount) + " Duplicates Ignored")
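# A possible driver for the two scrape functions above (a sketch, not part of
# the original excerpt; containerScrape and the exact CSV column layout it
# returns are assumed to be defined elsewhere in the same script).
descriptionlog = []

output_file = open("processors.csv", "w")  # hypothetical output file name
mainPageScrape(output_file)                # scrape the first results page
remainingPagesScrape(output_file)          # scrape page 2 onward until duplicates repeat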
from urllib.request import urlopen as UR
from bs4 import BeautifulSoup as soup
import re

URL_SIMP = 'https://www.newegg.com/Xbox-One-Systems/SubCategory/ID-3216'
URL_CLIENT = UR(URL_SIMP)
PAGE = URL_CLIENT.read()
URL_CLIENT.close()
PAGE_SOUP = soup(PAGE, "html.parser")
PAGE_CONTENT = PAGE_SOUP.findAll("div", {"class": "item-container"})

filename = "newegg.csv"
f = open(filename, "w")
headers = ("PRICE, SHIP, NAME\n")
f.write(headers)

for CONTENT in PAGE_CONTENT:

    PRICE_DATA = CONTENT.findAll("li", {"class": "price-current"})
    PRICE = PRICE_DATA[0].text
    PRICE_SIMP = re.sub(r"[^\d.]", "", PRICE)

    SHIP_CONTENT = CONTENT.findAll("li", {"class": "price-ship"})
    SHIP = SHIP_CONTENT[0].text.strip()

    NAME = CONTENT.img["title"]

    print("Price: " + "$" + PRICE_SIMP)
    print("Shipping Cost: " + SHIP)
    print("Name of Product: " + NAME)
    print("\n")
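    # Hedged completion (not in the original excerpt): write the row to the CSV
    # opened above, keeping the PRICE, SHIP, NAME column order of the header.
    f.write(PRICE_SIMP + "," + SHIP.replace(",", "|") + "," + NAME.replace(",", "|") + "\n")

f.close()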
Example #10
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'

# opening up connection, grabbing the page
Uclient = Ureq(my_url)

#it offloads the content into a variable
page_html = Uclient.read()

#close the connection
Uclient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each product
containers = page_soup.findAll("div", {"class": "item-container"})

for container in containers:
    brand = container.div.div.a.img["title"]
    
    title_container = container.findAll("a",{"class":"item-title"})
    product_name = title_container[0].text

    shipping_container = container.findAll("li",{"class":"price-ship"})
    shipping = shipping_container[0].text.strip()

    print("brand: " + brand)
    print("product_name: " + product_name)
    print("shipping: " + shipping)
Example #11
    def scrape():
        # build the Newegg search URL from the text typed into the Sentry entry box
        concat = Sentry.get()
        my_url = 'https://www.newegg.com/global/ph-en/p/pl?d={}'.format(concat)
        my_url = my_url.replace(' ', '+')
        uClient = Ureq(my_url)

        page_html = uClient.read()
        uClient.close()
        # html parsing
        page_soup = Soup(page_html, "html.parser")
        # grab each product container
        containers = page_soup.findAll("div", {"class": "item-container"})

        # ---------------------------------------- save the csv file
        fileName = "{}.csv".format(concat)

        f = open(fileName, "w")
        headers = "BRAND     , PRICES    ,  SAVES    , TITLES   , LINK    \n"
        f.write(headers)

        for container in containers[4:]:  # the first four containers are skipped
            brand_container = container.findAll("a", {"class": "item-brand"})
            brand = brand_container[0].img["title"]  # brand name

            may_know = container.findAll("a", {"class": "item-title"})  # product-title anchors

            title = container.a.img["title"]  # title of the listing
            hyper = brand_container[0]["href"]  # link taken from the brand anchor
            # current price: pull the digits out of the <strong> tag
            price_container = container.findAll("li",
                                                {"class": "price-current"})
            price_container2 = price_container[0].strong
            price = re.findall(r'.\d.\d\d\d', str(price_container2))
            prices = ''.join(price)

            # percentage saved, if the item is on sale
            save_container = container.findAll("span",
                                               {"class": "price-save-percent"})
            save = re.findall(r'\d\d.', str(save_container))
            saves = ''.join(save)

            if saves == '':
                saves = "None"
            if prices == "":
                prices = "Not Available"

            brandlistbox.insert(END, " :   " + brand)
            pricelistbox.insert(END, "₱ " + prices)
            savelistbox.insert(END, saves)
            Listbox4.insert(END, " :   " + title)
            hyperlink.insert(END, '  ' + hyper)
            #-------------------------------------------------------------------------

            f.write(
                brand.replace(',', '') + ", " + prices.replace(
                    ',', '.').replace('0', '1').replace('>', '    ') + ',' +
                saves.replace('', '').replace('None', '0%') + ', ' +
                title.replace(',', '') + ', ' + hyper + "\n")

        f.close()
        new_win = Button(window,
                         width=10,
                         text="New_Win",
                         command=mainwindow,
                         height=1,
                         font="Jokerman",
                         relief=RAISED,
                         activebackground="LightBlue1",
                         background='sky blue')
        new_win.place(x=105, y=90)
        messagebox.showinfo("Happens", "DONE! \n press ok to proceed")
from urllib.request import urlopen as Req
from bs4 import BeautifulSoup as soup


f = open('data.csv', 'w')

url = 'https://www.dicksmith.co.nz/dn/shop/phones/iphone/?page=1'

Client = Req(url)
page_html = Client.read()
Client.close()

page_soup = soup(page_html, 'html.parser')

containers = page_soup.find_all('div', class_='_1umis')
#container = containers[0]

f.write('Phone, Rating, Price \n')

for container in containers:
    Phone = container.find('a', itemprop='url').text
    if container.find('meta', itemprop='ratingValue') is None:
        Rating = 'no rating'
    else:
        Rating = container.find('meta', itemprop='ratingValue')['content']
    Price = container.find('span', itemprop='price')['content']
    f.write(Phone.replace(',',' |') + ', '+ Rating+', '+ Price + "\n")
    print(Phone.replace(',',' |') + ', '+ Rating+','+ Price.replace(',','') + "\n")

f.close()
def GetUrl(url):
    Uclient = Req(url)
    page_html = Uclient.read()
    Uclient.close()
    page_soup = Soup(page_html, "html.parser")
    return page_soup
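# A minimal usage sketch (not in the original excerpt): fetch a page with
# GetUrl and print its <title>, assuming the Req and Soup aliases it relies on
# are imported in the surrounding script.
page = GetUrl("https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38")
print(page.title)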
Example #14
# Declare my_url variable

my_url = "https://www.carfax.com/Used-Honda-Civic-Type-R_t10063"

# Load my_url contents into Scrapee variable

Scrapee = Req(my_url)

# Extract html to variable Scrapee_html

Scrapee_html = Scrapee.read()

# Close web page

Scrapee.close()

# Parse the html into a node tree and store it as variable Scrapee_soup

Scrapee_soup = soup(Scrapee_html, "html.parser")

#Find matching class data and store into three variables

Scrapee_soup_model = Scrapee_soup.findAll(
    "span", {"class": "srp-list-item-basic-info-model"})

Scrapee_soup_price = Scrapee_soup.findAll("span",
                                          {"class": "srp-list-item-price"})

Scrapee_soup_location = Scrapee_soup.findAll(
    "div", {"class": "srp-list-item-dealership-location"})
Example #15
def fetchPage(params={}):
    get = params.get
    link = get("link")
    ret_obj = {}
    if get("post_data"):
        log("called for : " + repr(params['link']))
    else:
        log("called for : " + repr(params))

    if not link or int(get("error", "0")) > 2:
        log("giving up")
        ret_obj["status"] = 500
        return ret_obj

    if get("post_data"):
        if get("hide_post_data"):
            log("Posting data")
        else:
            log("Posting data: " + urlencode(get("post_data")))

        request = HTTPRequest(link, urlencode(get("post_data")))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
    else:
        log("Got request")
        request = HTTPRequest(link)

    if get("headers"):
        for head in get("headers"):
            request.add_header(head[0], head[1])

    request.add_header('User-Agent', USERAGENT)

    if get("cookie"):
        request.add_header('Cookie', get("cookie"))

    if get("refering"):
        request.add_header('Referer', get("refering"))

    try:
        log("connecting to server...")

        con = OpenRequest(request)
        ret_obj["header"] = con.info()
        ret_obj["new_url"] = con.geturl()
        if get("no-content", "false") == "false":
            inputdata = con.read()
            ret_obj["content"] = inputdata.decode("utf-8")

        con.close()

        log("Done")
        ret_obj["status"] = 200
        return ret_obj

    except HTTPError as e:
        err = str(e)
        log("HTTPError : " + err)
        log("HTTPError - Headers: " + str(e.headers) + " - Content: " +
            str(e.fp.read()))

        params["error"] = str(int(get("error", "0")) + 1)
        ret = fetchPage(params)

        if "content" not in ret and e.fp:
            ret["content"] = e.fp.read()
            return ret

        ret_obj["status"] = 500
        return ret_obj
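# A minimal usage sketch (not part of the original source; it assumes USERAGENT
# and the HTTPRequest/OpenRequest aliases used above are defined in the
# surrounding module).
result = fetchPage({"link": "https://example.com/"})
if result["status"] == 200:
    print(result["content"][:200])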
Example #16
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as beau

myurl = 'https://campinascomprelocal.com.br/tipo/bares/'
print(myurl)

# open the connection and grab the page
uClient = UReq(myurl)
page_html = uClient.read()
uClient.close()

soup = beau(page_html, 'lxml')

contents = soup.title
print(contents)
                              e.g. "Google Candybar"
            The webpage will be opened on your laptop.
            Send "Close" and the Chrome browser will be killed.
"""
from urllib.request import urlopen as URL
import urllib.error as err
import webbrowser
import os
my_api = "647242931:AAG7wgAy4Fn-IWrgKn0RskeSnrowZcv6AOc"
temp = ""
while True:
    open_URL = 'https://api.telegram.org/bot' + my_api + '/getupdates'
    try:
        client_URL = URL(open_URL)
        data_URL = client_URL.read().decode('utf-8')
        client_URL.close()
        # crude parse: take the last quoted string from the last JSON object,
        # which is the text of the newest message
        getstring = data_URL.split("{")
        getdata = getstring[-1].split("\"")
        data = getdata[-2]
        if (data == temp):
            pass
        else:
            print("data received:", data)
            temp = data
            if (data[0:6] == "google" or data[0:6] == "Google"):
                url = "http://www.google.co.in/search?q=" + data[6:]
                webbrowser.open(url)
            elif (data[0:5] == "close" or data[0:5] == "Close"):
                print("Closed")
                # os.system('TASKKILL /F /IM chrome.exe')  # works on Windows
            else: