Code Example #1
File: a,py.py Project: zyzain/python_zeroing-
def parge_url(url):
    # Requires: import csv, re, requests; from bs4 import BeautifulSoup as beau
    # csv_name and headers are module-level globals.
    with open(csv_name, 'a', newline='', encoding='gbk') as f:  # append mode; 'w' would start a new file
        writer = csv.writer(f)
        response = requests.get(url, headers=headers)
        res = beau(response.text, 'lxml')
        print(response.status_code)
        for i in res.select('#comments > div.comment-item'):
            try:
                mid = beau(str(i), 'lxml')  # parse the single comment item again
                name = mid.select('span.comment-info a')[0].text
                star = re.findall('allstar(.*?) rating', str(i))
                time = mid.find_all(class_='comment-time')[0].get('title').strip()
                comment = mid.select('p span.short')[0].text
                img_url = mid.select('div.avatar a img')[0].get('src')
                row = [name, star, time, comment, img_url]
                print(row)
                try:
                    writer.writerow(row)
                except:
                    print('Failed to write the row *************************')
            except:
                print('Failed to parse the comment item -----------')
Code Example #2
def download_bug(len1, temp_bugid):
    # Python 2 code; requires: import urllib2; from bs4 import BeautifulSoup as beau
    while len1 != 0:
        l1 = temp_bugid  # work on a temporary copy of the list
        print "start of while loop"
        temp_bugid = []  # reset the list of bugs still to download
        for i in l1:
            bugnumber = i + '.html'
            url = 'https://code.google.com/p/chromium/issues/detail?id=' + i
            try:
                page = urllib2.urlopen(url)
                soup = beau(page)
                print bugnumber
                s = open(bugnumber, 'w')
                s.write(str(soup))
                s.close()
            except:  # a bare except catches any download error and moves on to the next bug
                print "bugnumber = %s" % (bugnumber)
                # remember every bug that failed to download so the while loop
                # retries it on the next pass, until the list is empty
                temp_bugid.append(i)
        # one could catch urllib2.HTTPError / urllib2.URLError here instead of a bare except
        print temp_bugid
        len1 = len(temp_bugid)
        print "length of temp_bugid = %s" % len1
Code Example #3
def droid(url, root):
    # Python 2 code; requires: import re, urllib2; from bs4 import BeautifulSoup as beau
    htm = urllib2.urlopen(url)  # open the F-Droid page for the app
    soup = beau(htm)            # parse the page HTML
    elm = soup.findAll('a')     # collect every <a> tag

    lst = []
    for i in elm:
        x = i.attrs["href"]     # take the href attribute of each link
        lst.append(x)

    # the F-Droid page has two github links; the one ending in "issues" is not the repository URL we want
    lis = []
    for i in lst:
        if "github" in i:
            if not re.search(r"issues$", i):  # keep only links that do not end in "issues"
                lis.append(i)
    temp = lis[-1]
    if re.search(r"/$", temp):
        temp = temp[:-1]        # drop a trailing slash
    print(temp)
    gitclone(temp, root)        # hand the repository URL to gitclone to perform the clone
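
The droid and github examples call a gitclone helper that is not shown on this page; a minimal sketch, assuming it simply shells out to the git CLI and clones into the directory root:

import subprocess

def gitclone(repo_url, root):
    # hypothetical helper, not part of the original project:
    # clone repo_url into the directory `root` with the git command-line tool
    subprocess.call(['git', 'clone', repo_url], cwd=root)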
Code Example #4
def download_bug(bugno, url):
    # Python 2 code; requires: import urllib2; from bs4 import BeautifulSoup as beau
    web_page = urllib2.urlopen(url)
    soup = beau(web_page)
    bugnumber = bugno + '.html'      # save the bug page as <bugno>.html
    print bugnumber
    s = open(bugnumber, 'w')
    s.write(str(soup))
    s.close()
Code Example #5
    def __parse_news(self, rss_html_content: str):
        content = {}
        soup = beau(rss_html_content, "html.parser")

        # drop the last <p> tag (boilerplate at the end of the item)
        soup.find_all("p")[-1].decompose()
        print("rss_Handler")
        # strip the "-123x456" thumbnail-size suffix from the image URL
        img_url = re.split(r"(-)\d+(x)\d*", soup.find("img").attrs["src"])
        content["img_url"] = img_url[0] + img_url[-1].split("?")[0]
        txt = soup.text.rstrip()
        content["txt"] = self.__split_news(txt)
        return content
Code Example #6
def github(url, root):
    # Python 2 code; requires: import urllib2; from bs4 import BeautifulSoup as beau
    htm = urllib2.urlopen(url)
    soup = beau(htm)
    elms = soup.select("h3.repo-list-name a")  # repository links on a GitHub search result page

    lst = []
    for i in elms:
        x = i.attrs["href"]
        lst.append(x)

    for git in lst:
        git = 'https://github.com' + git + '.git'  # turn the relative href into a clone URL
        gitclone(git, root)
        print(git)
Code Example #7
def bugid(url):
    # Python 2 code; requires: import urllib2; from bs4 import BeautifulSoup as beau
    # Appends to the module-level lists temp_bugid and temp4.
    htm = urllib2.urlopen(url)
    soup = beau(htm)
    elm = soup.findAll("td", {"class": "vt id col_0"})  # table cells holding the bug ids

    for i in elm:
        alink = i.findAll("a")
        for a in alink:
            str1 = a.text
            str1 = str1.replace('\n', '')
            str1 = str1.replace(' ', '')
            temp_bugid.append(str1)

    temp = temp_bugid[0:496]                           # ids already collected on the first page
    temp3 = [x for x in temp_bugid if x not in temp]   # ids beyond the first page
    temp4.extend(temp3)
Code Example #8
File: Crawler.py Project: zyzain/python_zeroing-
    def crawl_kuaidaili(self):
        # Requires: import random, re, time, requests; from bs4 import BeautifulSoup as beau
        # user_list is a module-level list of User-Agent strings.
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Referer': 'https://www.google.com/',
            'User-Agent': random.choice(user_list)
        }

        for i in range(1, 6):  # free proxy list pages are numbered from 1
            url = 'https://www.kuaidaili.com/free/inha/{}/'.format(i)
            time.sleep(2)
            res = requests.get(url, headers=headers)
            soup = beau(res.text, 'lxml')
            for j in soup.select('div > table > tbody > tr'):
                ip = re.findall('<td data-title="IP">(.*?)</td>', str(j))[0]
                port = re.findall('<td data-title="PORT">(.*?)</td>', str(j))[0]
                yield ':'.join([ip, port])
Code Example #9
File: Crawler.py Project: zyzain/python_zeroing-
    def crawl__66(self):
        # Requires: import random, re, time, requests; from bs4 import BeautifulSoup as beau
        # user_list is a module-level list of User-Agent strings.
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Referer': 'http://www.66ip.cn/2.html',
            'User-Agent': random.choice(user_list)
        }
        for i in range(1, 6):  # pages are numbered from 1
            url = 'http://www.66ip.cn/{}.html'.format(i)
            time.sleep(2)
            res = requests.get(url, headers=headers)
            soup = beau(res.text, 'lxml')
            for j in soup.select('div.containerbox > div > table > tr'):
                a = re.findall('<tr><td>(.*?)</td><td>', str(j))[0]
                if a == 'ip':  # skip the header row
                    continue
                port = re.findall('</td><td>(.*?)</td><td>', str(j))[0]
                yield ':'.join([a, port])
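
Both crawl_kuaidaili and crawl__66 are generator methods that yield proxies as "ip:port" strings; a minimal usage sketch, assuming they are defined on the Crawler class from Crawler.py:

crawler = Crawler()
# collect the yielded proxies into a list, e.g. ['1.2.3.4:8080', ...]
proxies = list(crawler.crawl_kuaidaili())
print(proxies[:5])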
Code Example #10
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as beau

myurl = 'https://campinascomprelocal.com.br/tipo/bares/'
print(myurl)

# open the connection and read the page HTML
uClient = UReq(myurl)
page_html = uClient.read()
uClient.close()

soup = beau(page_html, 'lxml')

contents = soup.title
print(contents)
Code Example #11
File: beauload_002.py Project: motovmp1/beautiful_02
import sys

from bs4 import BeautifulSoup as beau

soup = beau(open('/home/elsys/PycharmProjects/beautiful_01/bares.html'),
            "lxml")

filename = "bares.csv"
f = open(filename, "w")

headers = "ENDERECO, NUMERO, BARES\n"
f.write(headers)

for title in soup.findAll('div',
                          {"class": "jet-listing-dynamic-field__inline-wrap"}):
    print(title.text + ",")
    f.write(title.text + "\n")

f.close()
Code Example #12
# Fragment: bug_end, max_timeout_secs and url2 are defined earlier in the script.
# Python 2 code; requires: import urllib2; from bs4 import BeautifulSoup as beau
bug_end = bug_end.split("bug_end = ")
for i in bug_end:
    bug_end1 = i               # keep the part after "bug_end = "
bug_end1 = bug_end1.rstrip()

max_timeout_secs = max_timeout_secs.split("max_timeout_secs = ")
for i in max_timeout_secs:
    max_timeout_secs1 = i      # keep the part after "max_timeout_secs = "
max_timeout_secs1 = max_timeout_secs1.rstrip()

htm = urllib2.urlopen(url2)
soup = beau(htm)
elm = soup.findAll('a')        # collect every <a> tag

lst = []
for i in elm:
    x = i.attrs["href"]
    lst.append(x)

temp = []
for i in lst:
    if "/browse/" in i:        # keep only the repository browse links
        temp.append(i)
t1 = []
Code Example #13
- requests
- requests_html or beautifulsoup4 [optional]
- argparse or sys.argv

Hint:

- ``nargs`` https://docs.python.org/2/library/argparse.html
'''

import requests
import sys
from bs4 import BeautifulSoup as beau
ses = requests.Session()
resp = ses.get('https://ketqua.net')
data = resp.text
data_format = beau(data, 'html.parser')
list_bingo = []
index_special = data[23047:23049]  # fragile: hard-coded offset into the raw HTML


# print(data_format.prettify())
def get_list_bingo():
    for i in data_format.find_all('td', class_='chu17 need_blank'):
        if i.text.isdigit():
            list_bingo.append(i.text)
            # print(i.text)
    # print(list_bingo)


get_list_bingo()
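
The docstring mentions argparse or sys.argv with nargs, but the snippet never reads command-line arguments; a minimal sketch of the hinted argparse interface (argument names and semantics are assumptions):

import argparse

parser = argparse.ArgumentParser(description='Look up lottery results from https://ketqua.net')
# nargs='+' accepts one or more numbers to compare against list_bingo
parser.add_argument('numbers', nargs='+', help='numbers to check against the drawn results')
args = parser.parse_args()
print(args.numbers)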
Code Example #14
import sys

from bs4 import BeautifulSoup as beau

soup = beau(open('/home/elsys/PycharmProjects/beautiful_01/bares.html'),
            "html.parser")

filename = "bares3.csv"
f = open(filename, "w")

# headers = "ENDERECO, NUMERO, BARES\n"
# f.write(headers)

#
# for title in soup.findAll('div', {"class": "jet-listing-dynamic-field__inline-wrap"}):
#     print(title.text + ",")

# for title in soup.findAll('div', {"class": "jet-listing jet-listing-dynamic-field display-inline"}):
# for title in soup.findAll('div', {"class": "jet-listing-dynamic-field__content"}):

nome_bar = soup.findAll('div', {"class": "elementor-widget-container"})
# print(len(nome_bar))
# print(nome_bar[3].text)

filename = "bares3.csv"
f = open(filename, "w")

headers = "ENDERECO, NUMERO, BARES\n"
f.write(headers)

for list in nome_bar: