Example No. 1
from urllib.request import urlopen
from bs4 import BeautifulSoup as bts


def check_url(url):
    # Flag the record as unavailable when the page title says "Item Withheld"
    html = urlopen(url).read().decode('utf-8')
    content = bts(html, features='lxml')
    result = True
    if "Item Withheld" in content.title.get_text():
        result = False
    return result
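A quick usage sketch; the URL below is only a placeholder, not from the original example:

if check_url("https://example.com/record/123"):  # hypothetical record URL
    print("record is available")
else:
    print("record is withheld")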
Example No. 2
def getComment(aid):
    cid = getCid(aid)
    if cid < 0: return -1

    url = "https://comment.bilibili.com/" + str(cid) + ".xml"
    page = getData(url)
    soup = bts(page, "html.parser")
    comment = {}

    for c in soup.find_all('d'):
        time = float(c.attrs['p'].split(',')[0])
        comment[time] = c.string

    comment = sorted(comment.items(), key=lambda x: x[0])
    ret = [
        define.Comment(_time=x[0], _vtime=calcTime(x[0]), _content=x[1])
        for x in comment
    ]
    return ret
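Example No. 2 relies on helpers (getCid, getData, calcTime, define.Comment) that are not shown. A minimal sketch of what two of them might look like; the names come from the example, but these bodies are assumptions, not the original code:

from urllib.request import urlopen


def getData(url):
    # Hypothetical helper: download the raw body of a URL
    return urlopen(url).read()


def calcTime(seconds):
    # Hypothetical helper: format a danmaku offset in seconds as mm:ss
    minutes, secs = divmod(int(seconds), 60)
    return f"{minutes:02d}:{secs:02d}"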
Example No. 3
def crawler(x):
    import os
    import sys
    import urllib.request
    import urllib.parse
    from bs4 import BeautifulSoup as bts
    from text_preprocessing import clean_text
    from lst_pick import lst_pick
    client_id = "IFvtovuLLdeQi6K6jywv"
    client_secret = "_51j6auaOC"
    encText = urllib.parse.quote(x)
    start = 1
    str_big = []
    while start < 1000:
        url = "https://openapi.naver.com/v1/search/news.xml?query=" + encText + \
            "&display=30" +  "&sort=date" + "&start=" + str(start)  # xml 결과
        request = urllib.request.Request(url)
        request.add_header("X-Naver-Client-Id", client_id)
        request.add_header("X-Naver-Client-Secret", client_secret)
        response = urllib.request.urlopen(request)
        rescode = response.getcode()
        if rescode == 200:
            response_body = response.read()
            a = response_body.decode('utf-8')
        else:
            # On a non-200 response, report the code and skip this page
            print("Error Code: " + str(rescode))
            start += 30
            continue

        html = bts(a, "html.parser")
        news_titles = html.find_all("title")

        for title in news_titles:
            title_str = str(title.string)
            str_big.append(title_str.strip())
        start += 30

    data = {}
    try:
        pre_str_big = [clean_text(i) for i in str_big]
        for row in pre_str_big:
            for text in row.split():
                data[text] = data.get(text, 0) + 1
    except Exception as e:
        print("Exception occurred:", e)
    return data
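A hedged usage sketch for crawler; the query is only a placeholder, and the call needs valid Naver Open API credentials to succeed:

if __name__ == "__main__":
    counts = crawler("python")  # hypothetical search term
    top10 = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10]
    for word, n in top10:
        print(word, n)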
Example No. 4
# Test BeautifulSoup
from bs4 import BeautifulSoup as bts
import requests

# parse the html with lxml and BeautifulSoup
# and get the desired data from it
with open('simple.html') as htmlFile:
    soup = bts(htmlFile, 'lxml')
# get the title
match = soup.title.text


# get the prettified html
prettifiedSoup = soup.prettify()


with open('simple.html') as htmlFile:
    soup = bts(htmlFile, 'lxml')
# find the first div with class 'article'
article = soup.find('div', class_='article')
# print the headline
headline = article.h2.a.text
print(headline)
# print the summary
summary = article.p.text
print(summary)

# print the headline and summary of every div with class 'article'

for article in soup.findAll("div", class_='article'):
    headline = article.h2.a.text
    summary = article.p.text
    print(headline)
    print(summary)
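The snippet above assumes a local simple.html. A minimal, hypothetical fixture that satisfies the selectors it uses (write it first, then run the example):

sample_html = """<html>
  <head><title>Test - A Sample Website</title></head>
  <body>
    <div class="article">
      <h2><a href="#">Article 1 Headline</a></h2>
      <p>Summary of article 1.</p>
    </div>
    <div class="article">
      <h2><a href="#">Article 2 Headline</a></h2>
      <p>Summary of article 2.</p>
    </div>
  </body>
</html>
"""

# Hypothetical test fixture: write a simple.html matching the selectors used above
with open('simple.html', 'w') as f:
    f.write(sample_html)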
Example No. 5
from urllib.request import urlopen
from urllib.error import HTTPError
import re
from bs4 import BeautifulSoup as bts

try:
    html = urlopen("https://baike.baidu.com/item/gooogle")
except HTTPError as e:
    print(e)
else:
    if html is None:
        print('None')
    else:
        bsobj = bts(html.read(), "html.parser")
        title = bsobj.h1.get_text()
        dtlist = bsobj.findAll("dt", {"class": "basicInfo-item name"})
        ddlist = bsobj.findAll("dd", {"class": "basicInfo-item value"})

        print(title + ':')
        for name, value in zip(dtlist, ddlist):
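            # keep only Chinese characters (\u4e00-\u9fa5) from the <dt> markup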
            pattern = re.compile(r'[^\u4e00-\u9fa5]')
            name = re.sub(pattern, '', str(name))
            print('\t' + name + ' : ' + value.get_text().strip())
Example No. 6
    def __init__(self, SoupPage):
        self.serp = bts(SoupPage, 'lxml')
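This __init__ is only a method excerpt. A minimal, hypothetical enclosing class (the class name and the title accessor are assumptions, not part of the original):

from bs4 import BeautifulSoup as bts


class SerpParser:
    # Hypothetical wrapper class; only __init__ mirrors the original excerpt
    def __init__(self, SoupPage):
        self.serp = bts(SoupPage, 'lxml')

    def title(self):
        # Hypothetical accessor for the parsed page title
        return self.serp.title.string if self.serp.title else None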
Example No. 7
from bs4 import BeautifulSoup as bts
import re
import csv
from urllib.request import urlopen
import xlrd

count = 0
all_href = []
all_href_excel = []
#PLOS ONE
#url = "http://hub.hku.hk/simple-search?query=&location=publication&filter_field_1=journal&filter_type_1=equals&filter_value_1=plos+one&filter_field_2=dateIssued&filter_type_2=equals&filter_value_2=%5B2016+TO+2018%5D&sort_by=score&order=desc&rpp=25&etal=0&start=0";
#Scientific Reports
url = "http://hub.hku.hk/simple-search?query=&location=publication&rpp=25&sort_by=score&order=desc&filter_field_1=journal&filter_type_1=equals&filter_value_1=scientific+reports&filter_field_2=dateIssued&filter_type_2=equals&filter_value_2=%5B2016+TO+2018%5D"
while url is not None:
    html = urlopen(url).read().decode('utf-8')
    content = bts(html, features='lxml')

    pages = content.find('ul', 'pagination pull-right')
    #print (pages)
    next_url = None
    flag = False
    pages_refs = pages.find_all('li')
    for page_ref in pages_refs:
        if page_ref.has_attr('class'):
            flag = True
            continue
        if flag == True:
            next_url = page_ref.find('a')['href']
            break
    #print(next_url)
    if next_url is None:
        url = None  # no "next" link found: stop paging
    else:
        url = next_url
Example No. 8
import requests
from bs4 import BeautifulSoup as bts

url = requests.get("https://m.stock.naver.com/marketindex/index.nhn").text
soup = bts(url, 'html.parser')

for key in soup.select('.exchg_on'):
    name = key.select_one('.stock_dn').text
    # doll = key.select_one('.stock_price').text
    # gap = key.select_one('.gap_wrp').text
    print(f'{name}')
Example No. 9
from bs4 import BeautifulSoup as bts
from selenium import webdriver

driver = webdriver.Firefox()
url = "https://www.lazada.vn/dien-thoai-di-dong/?page=1"
driver.get(url)
root = driver.find_element_by_id(id_='root')
# print(root.get_attribute('innerHTML'))
lzd = root.get_attribute('innerHTML')
soup = bts(lzd, 'lxml')
# print(soup.title.string)
products = soup.find_all(class_='c2prKC')
# print(products)
# print(lzd.text)
count = 0
for product in products:
	# if product.has_attr('class'):
	count += 1
print(count)
driver.close()
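Note that newer Selenium releases (4.x) drop the find_element_by_* helpers. A sketch of the equivalent lookup for the root element, assuming Selenium 4:

from selenium.webdriver.common.by import By

root = driver.find_element(By.ID, 'root')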
Example No. 10
from urllib.request import urlopen
from bs4 import BeautifulSoup as bts
import re
import pandas as pd
import datetime

# Wikipedia URL (CSI 300 index page)
url = 'https://zh.wikipedia.org/wiki/%E6%B2%AA%E6%B7%B1300'

target_html = urlopen(url).read().decode('utf-8')

soup = bts(target_html, 'lxml')

target_table = soup.find('table')

CSI300_trs = target_table.find_all('tr')[1:]

target_data = []

for i in CSI300_trs:
	CSI300_tds = i.find_all('td')
	share_code =CSI300_tds[0].get_text()
	share_name = CSI300_tds[1].get_text()
	exchange = CSI300_tds[3].get_text()
	target_data.append([share_code, share_name, exchange])

CSI300_data = pd.DataFrame(target_data, columns=['share_code', 'share_name', 'Exchange'])

date = datetime.date.today().strftime('%Y%m%d')

CSI300_data.to_csv(f'CSI300_{date}.csv', encoding='utf_8_sig')