Code Example #1
 def scrap(self):
     url = 'https://finance.naver.com/item/sise_day.nhn?code={item}'.format(item=self.item)
     # price cells on the daily quote page use the "tah" class
     soup = BeautifulSoup(uo(url), "html.parser")
     for i in soup.find_all(name="span", attrs={"class": "tah"}):
         print(i.text)
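This method expects a class that stores the target stock code in self.item, with BeautifulSoup and urlopen (as uo) imported. A minimal, hypothetical wrapper to make the snippet self-contained could look like:

from urllib.request import urlopen as uo
from bs4 import BeautifulSoup

class NaverDailyPrice:
    # hypothetical class name; the original listing only shows the scrap() method
    def __init__(self, item):
        self.item = item  # stock code substituted into the URL, e.g. '005930'

    # the scrap() method from the snippet above would be defined here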
Code Example #2
def get_movie_list(substr):
    base_url = 'https://jsonmock.hackerrank.com/api/movies/search/?Title=' + str(substr)
    # open the connection and read the first page to learn the page count for this title
    uClient = uo(base_url)
    parsed_json = json.loads(uClient.read())
    uClient.close()
    tot_page = parsed_json["total_pages"]

    # collect every matching title across all pages
    title = []
    for pageNo in range(1, tot_page + 1):  # the API numbers pages from 1
        uClient = uo(base_url + '&page=' + str(pageNo))
        json_movie = json.loads(uClient.read())
        uClient.close()
        for movie in json_movie['data']:
            title.append(movie['Title'])

    return sorted(title)
Code Example #3
File: scrape.py  Project: mazeen1998/News-Bite
 def sc1(self):
     pg = r(
         homePage.url2,
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
         })
     pg = uo(pg)
     pg_ht = pg.read()
     pg.close()
     soup = sp(pg_ht, 'html.parser')
     logourl = '<img alt="Times of India" class="img-fluid" src="https://static.mediawire.in/brands/profilepic/1117/TOI%20Logo%20in%20Red%20Bakcground.jpg">'
     heads = []
     links = []
     imgs = []
     logos = []
     try:
         imgs1 = soup.find('div', {
             'class': 'listing4 clearfix'
         }).find('ul').findAll('li')
         for i in imgs1:
             heads.append(i.find('span').find('a').text)
             links.append(i.find('span').find('a').get('href'))
             imgs.append(i.find('a').find('img').get('data-src'))
             logos.append(logourl)
         news = list(zip(imgs, heads, links, logos))
         return news
     except Exception:
         # fall back to an empty list if the page layout changed or a tag was missing
         return []
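All of the News-Bite methods in this file repeat the same preamble: build a Request (r) with a browser User-Agent, open it with urlopen (uo), and parse the bytes with BeautifulSoup (sp). A small helper that captures that shared step, offered as a refactoring sketch rather than code from the project, might look like:

def fetch_soup(url):
    # download a page with a browser-like User-Agent and return a parsed soup
    req = r(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                        'AppleWebKit/537.36 (KHTML, like Gecko) '
                                        'Chrome/80.0.3987.100 Safari/537.36'})
    page = uo(req)
    try:
        return sp(page.read(), 'html.parser')
    finally:
        page.close()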
Code Example #4
File: scrape.py  Project: mazeen1998/News-Bite
 def sp(self):
     # pg=r(url1,{'User-Agent':'Magic Browser'})
     pg = r(
         sportPage.url1,
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
         })
     pg = uo(pg)
     pg_ht = pg.read()
     pg.close()
     soup = sp(pg_ht, 'html.parser')
     logourl = '<img class="img-fluid" src="https://indianexpress.com/wp-content/themes/indianexpress/images/indian-express-logo-n.svg" title="The Indian Express" alt="The Indian Express">'
     heads = []
     links = []
     imgs = []
     logos = []
     try:
         imgsl = soup.find('div', {
             'class': 'nation'
         }).findAll('div', {'class': 'snaps'})
         headsl = soup.find('div', {
             'class': 'nation'
         }).findAll('h2', {'class': 'title'})
         for i in imgsl:
             links.append(i.find('a').get('href'))
             logos.append(logourl)
             imgs.append(i.find('img').get('data-lazy-src'))
         for i in headsl:
             heads.append(i.find('a').text)
         news = list(zip(imgs, heads, links, logos))
         return news
     except Exception:
         return []
Code Example #5
File: scrape.py  Project: mazeen1998/News-Bite
 def tc2(self):
     pg = r(
         techPage.url3,
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
         })
     pg = uo(pg)
     pg_ht = pg.read()
     pg.close()
     soup = sp(pg_ht, 'html.parser')
     logourl = '<img class="img-fluid" src="https://cdn.gadgets360.com/gadgets360_logo.png" alt="Technology News" title="NDTV Gadgets 360">'
     heads = []
     links = []
     imgs = []
     logos = []
     try:
         imgsl = soup.find('div', {
             'class': 'story_list row margin_b30'
         }).findAll('div', {'class': 'thumb'})
         for i in imgsl:
             if i.find('img').get(
                     'src'
             ) == "https://gadgets.ndtv.com/static/icons/img_120n.png":
                 imgs.append(i.find('img').get('data-original'))
             else:
                 imgs.append(i.find('img').get('src'))
             links.append(i.find('a').get('href'))
             logos.append(logourl)
             heads.append(i.find('img').get('alt'))
         news = list(zip(imgs, heads, links, logos))
         return news
     except Exception:
         return []
Code Example #6
File: scrape.py  Project: mazeen1998/News-Bite
 def tc(self):
     pg = r(
         techPage.url1,
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
         })
     pg = uo(pg)
     pg_ht = pg.read()
     pg.close()
     soup = sp(pg_ht, 'html.parser')
     logourl = '<img class="img-fluid" src="https://indianexpress.com/wp-content/themes/indianexpress/images/indian-express-logo-n.svg" title="The Indian Express" alt="The Indian Express">'
     heads = []
     links = []
     imgs = []
     logos = []
     try:
         imgsl = soup.find('ul', {'class': 'article-list'}).findAll('li')
         for i in imgsl:
             imgs.append(i.find('img').get('src'))
             links.append(i.find('a').get('href'))
             logos.append(logourl)
             heads.append(i.find('img').get('alt'))
         news = list(zip(imgs, heads, links, logos))
         return news
     except Exception:
         return []
Code Example #7
File: scrape.py  Project: mazeen1998/News-Bite
 def ec(self):
     # pg=r(url1,{'User-Agent':'Magic Browser'})
     pg = r(
         ecoPage.url1,
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
         })
     pg = uo(pg)
     pg_ht = pg.read()
     pg.close()
     soup = sp(pg_ht, 'html.parser')
     logourl = '<img class="img-fluid" src="https://www.financialexpress.com/wp-content/themes/vip/financialexpress/assets/images/fe-logo-with-read-to-lead.svg" alt="Financial Express">'
     heads = []
     links = []
     imgs = []
     logos = []
     try:
         imgl1 = soup.find('div', {'class': 'leftcol'}).findAll('figure')
         titles1 = soup.find('div', {'class': 'leftcol'}).findAll('h2')
         titles2 = soup.find('div', {'class': 'leftcol'}).findAll('h3')
         for i in imgl1:
             imgs.append(i.find('img').get('data-src'))
             links.append(i.find('a').get('href'))
             logos.append(logourl)
         for i in titles1:
             heads.append(i.find('a').text)
         for i in titles2:
             heads.append(i.find('a').text)
         news = list(zip(imgs, heads, links, logos))
         return news
     except Exception:
         return []
Code Example #8
def fetch_url(url):
    # requires: from urllib.request import urlopen as uo; import urllib.error
    try:
        web_client = uo(url)
        if web_client.getcode() == 504:
            raise ConnectionError('gateway timeout')
        web_page = web_client.read()
    except urllib.error.HTTPError:
        raise ConnectionError('http error occurred')
    except urllib.error.URLError:
        raise ConnectionError('url error occurred')
    else:
        web_client.close()
        return web_page
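Because fetch_url converts every failure into ConnectionError, a caller only has to handle that one exception. A minimal usage sketch (the URL is just a placeholder):

try:
    page = fetch_url('http://example.com/')
    print(len(page), 'bytes fetched')
except ConnectionError as err:
    print('fetch failed:', err)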
Code Example #9
def scraping():
    global TREND_DATA
    uClient = uo(main_url)
    forum_page = uClient.read()
    uClient.close()
    pageSoup = soup(forum_page, "html.parser")

    # each "trend-card" div holds one time slot and its list of trending topics
    trendListContainer = pageSoup.findAll("div", {"class": "trend-card"})
    for i in trendListContainer:
        trendTime = i.find("h5").text
        trendList = i.findAll("li")
        for j in trendList:
            TREND_DATA.append(j.text)
    print("=========MOMENT TRENDING SCRAPE==========")
    FindMostTrending()
Code Example #10
def fetch_words(url):
    """
    Fetch a list of words from a URL
    
    Args:
        url: The URL of a UTF-8 text document
    
    Returns:
        story_words: A list of words
    """
    story = uo(url)
    story_words = []
    for line in story:
        line_words = line.decode('utf8').split()
        for word in line_words:
            story_words.append(word)
    story.close()
    return story_words
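For reference, a minimal call to the function above, using the same public text file that appears in Code Example #20:

story_words = fetch_words('http://sixty-north.com/c/t.txt')
print(story_words[:10])  # first ten words of the document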
Code Example #11
def getURL(page):
    global containerAmount
    containerAmount = 0
    for counter in range(page):
        my_url = 'https://store.steampowered.com/search/?specials=1&page=' + str(counter)
        print(counter)

        # open the connection and download the page
        uPage = uo(my_url)
        my_html = uPage.read()
        uPage.close()

        # html parsing
        global pageParse
        pageParse = soup(my_html, 'html.parser')

        # grab each product container (one per game)
        global myContainers
        myContainers = pageParse.find_all("div", {'class': 'responsive_search_name_combined'})
        containerAmount += len(myContainers)  # total number of games on the current page
        getPNamePriceReview()
        getPlatform()
        getRating()
Code Example #12
File: scrape.py  Project: mazeen1998/News-Bite
 def tc1(self):
     pg = r(
         techPage.url2,
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
         })
     pg = uo(pg)
     pg_ht = pg.read()
     pg.close()
     soup = sp(pg_ht, 'html.parser')
     logourl = '<img src="https://akm-img-a-in.tosshub.com/indiatoday/../sites/all/themes/itg/logo.png?v=1.3" alt="India Today" class="img-fluid">'
     heads = []
     links = []
     imgs = []
     logos = []
     try:
         imgs1 = soup.find('div', {
             'class': 'view-content'
         }).findAll('div', {'class': 'catagory-listing'})
         for i in imgs1:
             imgs.append(
                 i.find('div', {
                     'class': 'pic'
                 }).find('img').get('src'))
             heads.append(i.find('div', {'class': 'detail'}).find('a').text)
             links.append(
                 i.find('div', {
                     'class': 'detail'
                 }).find('a').get('href'))
             logos.append(logourl)
         news = list(zip(imgs, heads, links, logos))
         return news
     except Exception:
         return []
Code Example #13
File: scrape.py  Project: mazeen1998/News-Bite
 def sc2(self):
     pg = r(
         homePage.url3,
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
         })
     pg = uo(pg)
     pg_ht = pg.read()
     pg.close()
     soup = sp(pg_ht, 'html.parser')
     logourl = '<img src="https://www.cs.utah.edu/~deb/assets/images/media/logo_it.png" alt="India Today" class="img-fluid">'
     heads = []
     links = []
     imgs = []
     logos = []
     try:
         imgs1 = soup.find('div', {
             'class': 'view-content'
         }).findAll('div', {'class': 'catagory-listing'})
         for i in imgs1:
             imgs.append(
                 i.find('div', {
                     'class': 'pic'
                 }).find('img').get('src'))
             heads.append(i.find('div', {'class': 'detail'}).find('a').text)
             links.append(
                 i.find('div', {
                     'class': 'detail'
                 }).find('a').get('href'))
             logos.append(logourl)
         news = list(zip(imgs, heads, links, logos))
         return news
     except Exception:
         return []
Code Example #14
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uo

# Declaring the url and getting the full webpage using urllib.
url = input("Enter a link to the topic's Wikipedia page: ")
uclient = uo(url)
page_html = uclient.read()
uclient.close()

#Using beautiful soup to parse the html page into required form
page_soup = soup(page_html, "html.parser")
# Finding all paragraphs in "bodyContent" div's of the "content" div.
datadump = page_soup.body.find("div", {
    'id': 'content'
}).find("div", {
    'id': 'bodyContent'
}).find_all("p")

# filtering out all the reference (footnote) tags on the Wikipedia page
for udd in datadump:
    # find_all always returns a list (possibly empty), so iterate it directly
    for sup_tag in udd.find_all("sup"):
        sup_tag.string = ""

# Printing the collected info.
for dd in datadump:
    onlyTextDump = dd.get_text()
    print(onlyTextDump)
Code Example #15
import sys
# Not recommended: importing several modules on one line
import sys, os

# The correct way
from subprocess import Popen, PIPE
from urllib.request import *

# The correct way
from urllib.request import urlopen

# Give the imported module or function an alias
# import numpy as np
from urllib.request import urlopen as uo

uo("http://www.baidu.com")
"""
Commonly used built-in modules
https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000/0014319347182373b696e637cc04430b8ee2d548ca1b36d000
"""
"""
Installing third-party modules
https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000/001432002680493d1babda364904ca0a6e28374498d59a7000
"""

print("---------------------------1-----------------------------")

for x in "huangbo":
    print(x)

ddd = {'x': 'A', 'y': 'B', 'z': 'C'}
Code Example #16
File: AnchorParser.py  Project: wsdzl/PythonCode
    # Calling the instance directly returns the list of parsed links
    def __call__(self):
        self.feed(self.html)
        return self.data

    def handle_starttag(self, tag, attrs):
        if (not self.static_res) and (tag != 'a'):
            return
        prop = ('href', 'src')
        for attr in attrs:
            if attr[0] in prop:
                link = attr[1]
                if link.startswith('mailto:') or link.startswith(
                        'javascript:'):
                    return  # skip mailto: and javascript: links
                seek = link.find('#')
                if seek != -1:
                    link = link[:seek]  # strip the fragment (anchor) part
                if link:  # skip empty links
                    if self.url:
                        link = urljoin(self.url, link)  # resolve against the page URL
                    while link.endswith('/'):
                        link = link[:-1]
                    self.data.append(link)


if __name__ == '__main__':
    url = 'http://www.baidu.com'
    from urllib.request import urlopen as uo
    parser = AnchorParser(uo(url).read(), url, static_res=True)
    print(parser())
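Only two methods of AnchorParser appear in this excerpt; the class header and constructor are not shown. A minimal skeleton it plausibly sits in (an assumption, based on the html.parser.HTMLParser API the methods rely on) would be:

from html.parser import HTMLParser
from urllib.parse import urljoin

class AnchorParser(HTMLParser):
    # hypothetical constructor: the attribute names mirror those used above
    def __init__(self, html, url='', static_res=False):
        super().__init__()
        self.html = html if isinstance(html, str) else html.decode('utf-8', 'ignore')
        self.url = url
        self.static_res = static_res  # also collect src= links when True
        self.data = []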
Code Example #17
from urllib.request import urlopen as uo
from bs4 import BeautifulSoup as bs

link = 'http://results.vtu.ac.in/results17/result_page.php?usn=1rn14cs'
t = link
#.csv file writer
f = open("Res.csv", "w")
f.write("NAME" + "," + "USN" + "," + "MARKS" + "\n")

#link = link + '078'
for i in range(0, 400):
    link = t + str(i).zfill(3)  # build each USN from the fixed prefix saved in t
    print(link)
    results_client = uo(link)
    results_html = results_client.read()
    results_client.close()
    r_soup = bs(results_html, "html.parser")
    try:
        usn = r_soup.findAll("div", {"class": "col-md-12"})[3].findAll(
            "td", {"style": "padding-left:15px;text-transform:uppercase"
                   })[0].text.replace(":", "").strip()
        name = r_soup.findAll("div", {"class": "col-md-12"})[3].findAll(
            "td", {"style": "padding-left:15px"})[0].text.replace(":",
                                                                  "").strip()
        marks = r_soup.findAll(
            "table", {
                "style":
                "margin-left:30px;margin-bottom:5px;font-family:Times New Roman;font-size:12pt;"
            })[0].findAll("td",
                          {"style": "padding-left:10px"})[0].b.text.replace(
                              ":", "").strip()
        # write the scraped row to the CSV
        f.write(name + "," + usn + "," + marks + "\n")
    except (AttributeError, IndexError):
        # no result published for this USN; skip it
        pass

f.close()
Code Example #18
def get_tables(url):
    return [to_df(t) for t in bs(uo(url), 'html.parser').find_all('table')]
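The to_df helper is not shown in this snippet. One plausible implementation (an assumption, simply handing each bs4 table tag to pandas) would be:

import pandas as pd

def to_df(table_tag):
    # pandas.read_html parses the HTML string and returns a list of DataFrames;
    # a single <table> yields exactly one, so take the first element
    return pd.read_html(str(table_tag))[0]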
Code Example #19
from urllib.request import Request, urlopen as uo
from bs4 import BeautifulSoup as soup

# make scraper appear to website as a legitimate browser
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

url = "http://lottoresults.co.nz/lotto/archive"

# open connection, read NZ Lotto archive webpage, close connection
client = uo(Request(url, headers=headers))  # apply the browser-like headers defined above
webpage = client.read()
client.close()

# html parsing
webpage_soup = soup(webpage, "html.parser")

# grabs url for every month in every year and appends to an array
extract_months = webpage_soup.findAll("ul", {"class":""})
all_months = str(extract_months)
end_urls = []
i = 1
for month in all_months.split('"'):
    if (i % 2 == 0):
        end_urls.append(month)
    i+=1
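Splitting the stringified tags on quote characters works, but it is fragile. An alternative sketch (not the original approach) reads the href attributes directly from the anchors inside those <ul> elements:

end_urls = []
for month_list in webpage_soup.findAll("ul", {"class": ""}):
    for anchor in month_list.findAll("a"):
        href = anchor.get("href")
        if href:
            end_urls.append(href)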
Code Example #20
from urllib.request import urlopen as uo

with uo('http://sixty-north.com/c/t.txt') as story:
    words = []
    for line in story:
        # each line arrives as bytes; decode it and split on whitespace
        words.extend(line.decode('utf-8').split())

print(words)
Code Example #21
Start_Row = 1

#Call the ticker loop
Ticker_Found = loop_through_excel(Start_Row, Total_Rows)

#Creates a pandas DataFrame of ticker symbols and names from an Excel spreadsheet.
Ticker_List = 'Ticker_List.xlsx'
Tickers = pd.read_excel(Ticker_List,
                        header=0,
                        index_col=False,
                        keep_default_na=True)

#Searches, parses, and then locates the Market Cap from Yahoo Finance.
#html = uo('https://finance.yahoo.com/quote/AAPL/key-statistics?p=AAPL')
for i in Tickers:
    html = uo('https://finance.yahoo.com/quote/' + i + '/key-statistics?p=' + i)
    print(i)

# only the last page fetched in the loop above is parsed here
read = bs(html.read(), 'html.parser')
MarketCap = read.find('td', {'class': 'Fz(s) Fw(500) Ta(end) Pstart(10px) Miw(60px)'})

print(MarketCap.get_text())
#Prints the top 5 DataFrame rows and the Market Cap from Yahoo Finance.
print(Tickers.head())
Code Example #22
from bs4 import BeautifulSoup
from urllib.request import Request as R, urlopen as uo

url = "http://synd.cricbuzz.com/j2me/1.0/livematches.xml"
h = {'User-Agent': ''}

req = R(url, headers=h)
resp = uo(req)

xml = BeautifulSoup(resp, "xml")

for val in xml.find("mchdata"):
    print(val)
Code Example #23
from urllib.request import urlopen as uo
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38'
# my_url = 'https://twitter.com/picoiyer'

uClient = uo(my_url)
page_html = uClient.read()
uClient.close()

# parsing html here
page_soup = soup(page_html, 'html.parser')

#grab parts of page
containers = page_soup.findAll("div", {"class": "item-container"})
# print(page_soup.p, page_soup.h1)
# print(containers[0])

for container in containers:
    # print(container)
    brand = container.div.div.a.img["title"]
    title_container = container.findAll('a', {"class": "item-title"})
    # print(title_container)
    product_name = title_container[0].text
    shipping = container.findAll('li', {"class": "price-ship"})[0].text.strip()

    print("Brand: {}".format(brand))
    print("product Name: {}".format(product_name))
    print("Shipping: {}".format(shipping))