def parse2(self, response):
    bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
    item = response.meta['item']
    item['parcel_status'] = 'sold'
# ch21_27.py
import bs4, requests, re

url = 'http://www.taiwanlottery.com.tw'
html = requests.get(url)
print("Downloading page ...")
html.raise_for_status()                              # verify the page downloaded successfully
print("Page download complete")

objSoup = bs4.BeautifulSoup(html.text, 'lxml')       # create the BeautifulSoup object
dataTag = objSoup.select('.contents_box02')          # find elements whose class is contents_box02
print("List length", len(dataTag))
for i in range(len(dataTag)):                        # list the elements containing contents_box02
    print(dataTag[i])

pattern = r'\d+/\d+/\d+'
# find the balls for drawn order and sorted order
balls = dataTag[0].find_all('div', {'class': 'ball_tx ball_green'})
date = dataTag[0].find('span', {'class': 'font_black15'})
datelist = re.findall(pattern, str(date))
print('Super Lotto draw date: ' + datelist[0])
print("Drawn order : ", end='')
for i in range(6):                                   # the first 6 balls are the drawn order
    print(balls[i].text, end=' ')
print("\nSorted order : ", end='')
for i in range(6, len(balls)):                       # balls from the 7th on are in sorted order
    print(balls[i].text, end=' ')
# find the red ball of the second zone
import requests
import bs4

ress = requests.get('https://loksabha.nic.in/')
soup = bs4.BeautifulSoup(ress.text, 'html.parser')
s = soup.find_all('div', {'class': 'update'})
for count in s:
    print(count.findChild('ul').text)
import urllib.request
import bs4
import time
import sqlite3

## Variable declarations ##
con, cur = None, None
data1, data2, data3 = "", "", ""
sql = ""

## Main code ##
while True:
    url = "http://news.naver.com/"
    html = urllib.request.urlopen(url)
    bs_obj = bs4.BeautifulSoup(html, "html.parser")
    hdline_article_list = bs_obj.find("ul", {"class": "hdline_article_list"})
    lis = hdline_article_list.findAll("li")
    con = sqlite3.connect("../sqlite-tools-win32-x86-3300100/naverDB")
    cur = con.cursor()
    for li in lis:
        a = li.find("a")
        a.text.strip()
        data1 = "정치"                       # news category ("정치" = politics)
        data2 = a.text.strip()
        data2 = data2.replace("'", "\"")     # news title
        data3 = "ydgil"                      # author
def test_selector_build():
    # Simple case
    source = """<html><body><div class="hi"></div></body></html>"""
    soup = bs4.BeautifulSoup(source, "html5lib")
    element = soup.body.select("div")[0]
    selector = Selector.build(soup, element)
    assert selector.css == "html>body>div"
    assert selector.xpath == "/html/body/div"
    selector = Selector.build(soup, element)
    assert selector.css == "html>body>div"
    assert selector.xpath == "/html/body/div"

    # Complex nesting
    source = """<html><body>
    <a></a>
    <div><a></a></div>
    </body></html>"""
    soup = bs4.BeautifulSoup(source, "html5lib")
    elements = soup.body.select("a")
    selector = Selector.build(soup, elements[0])
    assert selector.css == "html>body>a"
    assert selector.xpath == "/html/body/a"
    selector = Selector.build(soup, elements[1])
    assert selector.css == "html>body>div>a"
    assert selector.xpath == "/html/body/div/a"

    # Deeply nested
    source = """<html><body>
    <div class="a" wtl-uid="12"><div><div class="b"><div class="c"><div class="d"><div class="e"><div class="f">
    <span>Hi</span>
    </div></div></div></div></div></div></div>
    <div class="a"><div><div class="b"><div class="c"><div class="d"><div class="f">
    <span>Howdy</span>
    </div></div></div></div></div></div></div>
    </body></html>"""
    soup = bs4.BeautifulSoup(source, "html5lib")
    element = soup.body.select(".e span")[0]
    selector = Selector.build(soup, element)
    assert selector.css == "html>body>div:nth-of-type(1)>div>div>div>div>div>div>span"
    assert selector.xpath == "/html/body/div[1]/div/div/div/div/div/div/span"
    selector = Selector.build(soup, element)
    assert selector.css == "html>body>div:nth-of-type(1)>div>div>div>div>div>div>span"
    assert selector.xpath == "/html/body/div[1]/div/div/div/div/div/div/span"
    element = soup.body.select(".d > .f > span")[0]
    selector = Selector.build(soup, element)
    assert selector.css == "html>body>div:nth-of-type(2)>div>div>div>div>div>span"
    assert selector.xpath == "/html/body/div[2]/div/div/div/div/div/span"
    selector = Selector.build(soup, 12)
    assert selector.css == "html>body>div:nth-of-type(1)"
    assert selector.xpath == "/html/body/div[1]"

    # Unsafe names
    source = """<html><body>
    <div:nonstandard><a></a></div>
    </body></html>"""
    soup = bs4.BeautifulSoup(source, "html5lib")
    element = soup.body.select("a")[0]
    selector = Selector.build(soup, element)
    assert selector.css == "html>body>*>a"
    assert selector.xpath == "/html/body/*/a"

    # Invalid WTL-uid
    selector = Selector.build(soup, 23)
    assert selector.css == "bad_wtl_uid_no_matches"
    assert selector.xpath == "bad_wtl_uid_no_matches"
def __init__(self, html):
    self._soup = bs4.BeautifulSoup(html, 'html5lib')
    self._inline_scripts = None
    self._scripts = None
def get_soup(self, url):
    # Open the URL once, parse the response, then close that same handle
    # (the original opened the URL a second time just to close it).
    resp = req.urlopen(url)
    ret = bs4.BeautifulSoup(resp.read(), 'lxml')
    resp.close()
    return ret
gross = []  # 19
mv_attributs = (names, years, imdb_ratings, metascores, votes, categories, mv_pages,
                genre1, genre2, genre3, stars1, stars2, stars3, rank, nb_oscar,
                win, nom, runtime, budget, gross)

# TEST POSSIBILITIES :
page_link = "https://www.imdb.com/title/tt7286456/"    # oscar win nom
# page_link = "https://www.imdb.com/title/tt0120903/"  # win nom
# page_link = "https://www.imdb.com/title/tt6914122/"  # nom
# page_link = "https://www.imdb.com/title/tt8201852/"  # Empty
# page_link = "https://www.imdb.com/title/tt2017038/"  # 1 Star

response = requests.get(page_link)
html = bs4.BeautifulSoup(response.text, 'html.parser')

nb_genre = 0
# get the movie genres
div = html.find('div', class_="subtext")
# test_genre = False
for a in div.find_all('a'):
    # test_genre = False
    title = a.get('title')  # there is a title attribute on one link, which we do not want
    if title is None:
        mv_attributs[7 + nb_genre].append(a.text)
        # test_genre = True
        nb_genre += 1
if nb_genre == 1:
    mv_attributs[8].append(None)
import zipfile
comp_file = zipfile.ZipFile('comp_file.zip', 'w')   # note: the class is ZipFile, not Zipfile

import requests
import bs4

result = requests.get("http://example.com")
type(result)
result.text

soup = bs4.BeautifulSoup(result.text, "lxml")
soup
soup.select('title')[0].getText()
site_para = soup.select("p")
site_para[0]

res = requests.get('http://en.wikipedia.org/wiki/Grace_Hopper')
soup = bs4.BeautifulSoup(res.text, "lxml")
first_item = soup.select('.toctext')[0]
def prettify(raw_html: str):
    soup = bs4.BeautifulSoup(raw_html, 'html.parser')
    print(soup.prettify())
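# A small, hedged usage sketch for prettify() above: fetching the page with requests and
# the example.com URL are assumptions made here for illustration; any HTML string works.
if __name__ == "__main__":
    import requests

    resp = requests.get("http://example.com")
    resp.raise_for_status()
    prettify(resp.text)   # prints the page's markup with indentation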
def get_splits(
    playerid: str,
    year: Optional[int] = None,
    player_info: bool = False,
    pitching_splits: bool = False
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict]]:
    """
    Returns a dataframe of all split stats for a given player.
    If player_info is True, this will also return a dictionary that includes
    player position, handedness, height, weight, position, and team
    """
    soup = get_split_soup(playerid, year, pitching_splits)
    # the splits tables on the bbref site are all within an embedded comment.
    # This finds all the comments
    comment = soup.find_all(text=lambda text: isinstance(text, bs.Comment))
    data = []
    level_data = []
    for i in range(len(comment)):
        commentsoup = bs.BeautifulSoup(comment[i], 'lxml')
        split_tables = commentsoup.find_all("div", {"class": "table_container"})
        splits = [ele for ele in split_tables]
        headings = []
        level_headings = []
        for j in range(len(splits)):
            split_type = splits[j].find_all('caption')[0].string.strip()
            # two types of tables on bref, game level and non-game level
            if split_type[-5:] == 'Level':
                if year is None:
                    # The bbref tables for career splits have one extra preceding th column
                    # labeled 'I' that is not used and is not in the single season records
                    level_headings = [
                        th.get_text() for th in splits[j].find("tr").find_all("th")
                    ][1:]
                else:
                    level_headings = [
                        th.get_text() for th in splits[j].find("tr").find_all("th")
                    ][:]
                level_headings.append('Split Type')
                level_headings.append('Player ID')
                # singles data isn't included in the tables so this appends the column header
                level_headings.append('1B')
                level_data.append(level_headings)
                rows = splits[j].find_all('tr')
                for row in rows:
                    if year is None:
                        # Career splits have the extra unused 'I' th column
                        level_cols = row.find_all('td')
                    else:
                        level_cols = row.find_all(['th', 'td'])
                    level_cols = [ele.text.strip() for ele in level_cols]
                    if split_type != "By Inning":
                        # bbref added three empty columns to the by inning tables that don't
                        # match the rest of the tables. Not including this split table in results
                        level_cols.append(split_type)
                        level_cols.append(playerid)
                        level_data.append([ele for ele in level_cols])
            else:
                if year is None:
                    # Career splits have the extra unused 'I' th column
                    headings = [
                        th.get_text() for th in splits[j].find("tr").find_all("th")
                    ][1:]
                else:
                    headings = [
                        th.get_text() for th in splits[j].find("tr").find_all("th")
                    ][:]
                headings.append('Split Type')
                headings.append('Player ID')
                # singles data isn't included in the tables so this appends the column header
                headings.append('1B')
                data.append(headings)
                rows = splits[j].find_all('tr')
                for row in rows:
                    if year is None:
                        # Career splits have the extra unused 'I' th column
                        cols = row.find_all('td')
                    else:
                        cols = row.find_all(['th', 'td'])
                    cols = [ele.text.strip() for ele in cols]
                    if split_type != "By Inning":
                        # bbref added three empty columns to the by inning tables that don't
                        # match the rest of the tables. Not including this split table in results
                        cols.append(split_type)
                        cols.append(playerid)
                        data.append([ele for ele in cols])
    data = pd.DataFrame(data)
    data = data.rename(columns=data.iloc[0])
    data = data.reindex(data.index.drop(0))
    data = data.set_index(['Player ID', 'Split Type', 'Split'])
    data = data.drop(index=['Split'], level=2)
    data = data.apply(pd.to_numeric, errors='coerce').convert_dtypes()
    data = data.dropna(axis=1, how='all')
    data['1B'] = data['H'] - data['2B'] - data['3B'] - data['HR']
    data = data.loc[playerid]
    if pitching_splits is True:
        # Returns Game Level tables as a second dataframe for pitching splits
        level_data = pd.DataFrame(level_data)
        level_data = level_data.rename(columns=level_data.iloc[0])
        level_data = level_data.reindex(level_data.index.drop(0))
        level_data = level_data.set_index(['Player ID', 'Split Type', 'Split'])
        level_data = level_data.drop(index=['Split'], level=2)
        level_data = level_data.apply(pd.to_numeric, errors='coerce').convert_dtypes()
        level_data = level_data.dropna(axis=1, how='all')
        level_data = level_data.loc[playerid]
        # data = pd.concat([data, level_data])
    if player_info is False:
        if pitching_splits is True:
            return data, level_data
        else:
            return data
    else:
        player_info_data = get_player_info(playerid=playerid, soup=soup)
        if pitching_splits is True:
            return data, player_info_data, level_data
        else:
            return data, player_info_data
def get_id(raw_html: str, tag_id: str) -> bs4.ResultSet:
    soup = bs4.BeautifulSoup(raw_html, 'html.parser')
    return soup.find_all("div", id=tag_id)
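# Hedged usage sketch for get_id(): the sample HTML and id below are made up for
# illustration; get_id() returns a bs4.ResultSet of the matching <div> tags.
sample_html = '<div id="content">hello</div><div id="footer">bye</div>'
for tag in get_id(sample_html, "content"):
    print(tag.get_text())   # -> hello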
def call_rec(sub, vid_id, seek_time):
    print("SEEK_TIME:" + seek_time)
    seek_time = int(seek_time)
    topic = sub.split()[0].lower()
    # nltk.download('punkt')
    # nltk.download('averaged_perceptron_tagger')
    # nltk.download('stopwords')
    transcript_entries = YouTubeTranscriptApi.get_transcript(vid_id, languages=['en'])
    transcript = ''
    for i in range(len(transcript_entries)):
        if transcript_entries[i]['start'] < seek_time:
            transcript = transcript + ' ' + transcript_entries[i]['text']
        else:
            break
    print(transcript)

    p = wikipedia.page(sub)
    # print(p.url)
    # print(p.title)
    content = p.content
    stop_words = set(stopwords.words('english'))
    text = content + transcript
    text = ' '.join([word.lower() for word in text.split()
                     if word.lower() not in stop_words and len(word) > 2])
    # print('the' in text.split())
    data = []
    from nltk.tokenize import sent_tokenize, word_tokenize

    # iterate through each sentence in the file
    f = text.replace("\n", " ").replace(",", "").replace("(", "").replace(")", "").replace(";", "")
    for i in sent_tokenize(f):
        temp = []
        # tokenize the sentence into words
        for j in word_tokenize(i):
            if j.isalpha() and j.lower() not in stop_words:
                temp.append(j.lower())
        data.append(temp)
    # print('the' in data)

    # Create CBOW model
    model1 = Word2Vec(data, min_count=1, size=100, window=10)
    model1.train(data, total_examples=1, epochs=50)
    # print("the" in model1.wv.vocab)

    topic_relevant = []
    for t in model1.wv.most_similar(topic):
        topic_relevant.append(t[0])
    # print(topic_relevant)

    about_topics = ''
    for topics in topic_relevant:
        # print("***" + topics)
        response = requests.get("https://en.wikipedia.org/wiki/" + topics)
        about_topics += topics + ' :'
        if response is not None:
            html = bs4.BeautifulSoup(response.text, 'html.parser')
            paragraphs = html.select("p")
            # print(wikipedia.page(topics).content)
            for para in paragraphs:
                # print("##########################")
                # print(para.text)
                if len(para.text.split()) > 20:
                    about_topics = about_topics + para.text
                    break
        about_topics = about_topics + '\n'
        response.close()
    print(topic_relevant)
    return about_topics
import bs4 as bs
import urllib.request

url = 'http://lishi.tianqi.com/shantou/201101.html'
source = urllib.request.urlopen(url)
soup = bs.BeautifulSoup(source, 'html.parser')
uls = soup.find_all('ul')
for ul in uls:
    lis = ul.find_all('li')
    if len(lis[0].text) == 10:
        for li in lis:
            print(li.text)
def _get_image(url: str) -> tuple:
    with session.get(url) as res:
        res.raise_for_status()
        soup = bs4.BeautifulSoup(res.text, "html.parser")
        img = soup("div", class_="storyContent")[-1].img
        return (img["title"], img["src"])
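# Hedged usage sketch for _get_image(): the function relies on a module-level
# requests.Session named `session`; the Session created here and the story URL are
# illustrative assumptions, and a real page containing a div.storyContent is needed.
import requests

session = requests.Session()                             # assumed shared session
title, src = _get_image("https://example.com/story")     # placeholder URL
print(title, src)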
def try_parse_html(html, **kwargs):
    try:
        return bs4.BeautifulSoup(html, 'html.parser', **kwargs)
    except HTMLParseError:
        return None
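# Hedged usage example for try_parse_html(): html.parser is lenient, so even malformed
# markup usually parses and a soup (rather than None) comes back.
soup = try_parse_html("<p>unclosed paragraph")
if soup is not None:
    print(soup.p.get_text())   # -> unclosed paragraph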
#! python3
# downloadXckd.py - downloads all the comics from a website

import requests, os, bs4

url = 'http://xkcd.com'                  # starting url
os.makedirs('xkcd', exist_ok=True)       # store comics in ./xkcd and make sure the folder exists

while not url.endswith('#'):             # while the url does not end with the string '#'
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()               # make sure there was no error fetching the url
    soup = bs4.BeautifulSoup(res.text, "html.parser")   # parse the html, looking for the parts we need

    comicElem = soup.select('#comic img')    # look for the image url
    if comicElem == []:
        print('Could not find comic image.')  # if nothing is found, report it
    else:
        comicUrl = comicElem[0].get('src')
        print('Downloading image %s...' % (comicUrl))
        res = requests.get(comicUrl)
        res.raise_for_status()
        # if found, the image is saved into the chosen directory; at the end of each
        # iteration the url is checked with raise_for_status() to avoid errors
        imageFile = open(os.path.join('xkcd', os.path.basename(comicUrl)), 'wb')
add_arg('--html-tag', default='a', type=str, help='html tag you want to parse')
add_arg('--unique-id', default='', type=str, help='A common string in your links')
add_arg('--output-file', default='output_file.txt', type=str,
        help='A file to write to when stdout is activated')

args = parser.parse_args()
parsed_url = urlparse(*args.url)
domain = '{uri.netloc}'.format(uri=parsed_url)
url = Request(*args.url, headers=headers)
bs4_data = bs4.BeautifulSoup(urllib.request.urlopen(url), "lxml")
urls_list = []

if args.html_tag == 'a':
    href = 'href'
    for tag in bs4_data.find_all(args.html_tag):
        if args.unique_id:
            # only keep links whose markup contains the requested identifier
            if args.unique_id in str(tag):
                urls_list.append(str(tag['href']))
        else:
            urls_list.append(str(tag['href']))
else:
    for tag in bs4_data.find_all(args.html_tag):
        if args.unique_id in str(tag):
            urls_list.append(str(tag))
def GenerateHTML(self, controller, minify=False, prettify=False):
    soup = _CreateSoupWithoutHeadOrBody(six.text_type(self._soup))

    # Remove doctype.
    for x in soup.contents:
        if isinstance(x, bs4.Doctype):
            x.extract()

    # Remove declaration.
    for x in soup.contents:
        if isinstance(x, bs4.Declaration):
            x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
        imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
        script.extract()

    # Remove all in-line scripts.
    scripts_external = soup.findAll('script', src=None)
    for script in scripts_external:
        script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
        html = controller.GetHTMLForInlineStylesheet(six.text_type(style.string))
        if html:
            ns = soup.new_tag('style')
            ns.append(bs4.NavigableString(html))
            style.replaceWith(ns)
        else:
            style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
        html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
        if html:
            tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
            assert len(tmp) == 1
            stylesheet_link.replaceWith(tmp[0])
        else:
            stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
        comments = soup.findAll(
            text=lambda text: isinstance(text, bs4.Comment))
        for comment in comments:
            comment.extract()

    if prettify:
        return soup.prettify('utf-8').strip()

    # We are done.
    return six.text_type(soup).strip()
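# Hedged sketch of the controller interface GenerateHTML() depends on: only the two hook
# methods used above are modeled, and this stub is illustrative, not the real controller.
class _StubController:
    def GetHTMLForInlineStylesheet(self, css_text):
        # Return the (possibly rewritten) CSS to keep the <style> block, or a falsy
        # value to have the block removed.
        return css_text

    def GetHTMLForStylesheetHRef(self, href):
        # Return inline HTML to replace the <link rel="stylesheet">, or None to drop it.
        return None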
import requests
import bs4

res = requests.get('https://google.com/search?q=' + 'loyalty program million')
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")
linkElements = soup.select('.r a')
linkToSave = min(10, len(linkElements))
for i in range(linkToSave):
    with open('Links.txt', 'a+') as fo:
        # the with statement closes the file automatically
        fo.write('https://google.com' + linkElements[i].get('href') + '\n')
#! python3
# downloadXkcd.py - Downloads every single XKCD comic.

import requests, os, bs4

url = 'http://xkcd.com'               # starting url
os.makedirs('xkcd', exist_ok=True)    # store comics in ./xkcd
while not url.endswith('#'):
    # Download the page
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Find the URL of the comic image.
    comicElem = soup.select('#comic img')
    if comicElem == []:
        print('Could not find comic image')
    else:
        comicUrl = 'http:' + comicElem[0].get('src')
        # Download the image
        print('Downloading image %s' % (comicUrl))
        res = requests.get(comicUrl)
        res.raise_for_status()

        # Save the image to ./xkcd
        imageFile = open(os.path.join('xkcd', os.path.basename(comicUrl)), 'wb')
        for chunk in res.iter_content(100000):
            imageFile.write(chunk)
def scraper(self):
    df = pd.read_csv(self.input_file)
    for idx, row in df['Facebook Page ID'].iteritems():
        print(idx)
        dic = {'id': '', 'about': '', 'products': '', 'web_url': '',
               'web_url1': '', 'web_url2': '', 'category': ''}
        fb_id = str(row)
        fb_id = fb_id.split("'")
        fb_id = str(fb_id[1])
        print(fb_id)
        dic['id'] = fb_id
        url = "https://www.facebook.com/{}/about/".format(fb_id)
        header = {'accept-language': 'en-US,en;q=0.9'}
        try:
            resp = requests.get(url, headers=header)
            soup = bs4.BeautifulSoup(resp.text, 'html.parser')
            obj_list = soup.find_all('div', {'class': ['_50f4', '_3-8w']})
            for val in obj_list:
                try:
                    if 'About'.lower() in val.getText().strip().lower():
                        about_data = val.find_next().getText()
                        print(about_data)
                        dic['about'] = about_data
                except Exception as e:
                    print(e)
            for val1 in obj_list:
                try:
                    if 'Products'.lower() in val1.getText().strip().lower():
                        product_data = val1.find_next().getText()
                        print(product_data)
                        dic['products'] = product_data
                except Exception as e:
                    print(e)
            try:
                data = Selector(text=resp.text)
                web_url = data.xpath('//*[@id="u_0_p"]/div')
                web_url = web_url.get('data')
                web_url = str(web_url)
                web_url = web_url.split(">")
                web_url = str(web_url[1])
                web_url = web_url.split("<")
                web_url = web_url[0]
                print(web_url)
                dic['web_url'] = web_url
            except Exception as e:
                print(e)
            try:
                data1 = Selector(text=resp.text)
                web_url1 = data1.xpath('//*[@id="u_0_q"]/div')
                web_url1 = web_url1.get('data')
                web_url1 = str(web_url1)
                web_url1 = web_url1.split(">")
                web_url1 = str(web_url1[1])
                web_url1 = web_url1.split("<")
                web_url1 = web_url1[0]
                print(web_url1)
                dic['web_url1'] = web_url1
            except Exception as e:
                print(e)
            try:
                data2 = Selector(text=resp.text)
                web_url2 = data2.xpath('//*[@id="u_0_o"]/div')
                web_url2 = web_url2.get('data')
                web_url2 = str(web_url2)
                web_url2 = web_url2.split(">")
                web_url2 = str(web_url2[1])
                web_url2 = web_url2.split("<")
                web_url2 = web_url2[0]
                print(web_url2)
                dic['web_url2'] = web_url2
            except Exception as e:
                print(e)
            try:
                category_regex = r'\/pages\/category\/[0-9A-z-]+'
                regex_compile = re.compile(category_regex)
                search_category = regex_compile.findall(resp.text)[0]
                search_category = str(search_category)
                search_category = search_category.split("/")
                search_category = str(search_category[3])
                print(search_category)
                dic['category'] = search_category
            except Exception as e:
                print(e)
        except Exception as e:
            print(e)
        sleep(randint(5, 8))
        with open('/home/praveen/Working_files/Social_bakers_collection/Indian_top_facebook_brand_output.json', 'a') as output:
            json.dump(dic, output)
            output.write('\n')
async def apk(e):
    approved_userss = approved_users.find({})
    for ch in approved_userss:
        iid = ch["id"]
        userss = ch["user"]
    if e.is_group:
        if await is_register_admin(e.input_chat, e.message.sender_id):
            pass
        elif e.chat_id == iid and e.sender_id == userss:
            pass
        else:
            return
    try:
        app_name = e.pattern_match.group(1)
        remove_space = app_name.split(" ")
        final_name = "+".join(remove_space)
        page = requests.get(
            "https://play.google.com/store/search?q=" + final_name + "&c=apps"
        )
        lnk = str(page.status_code)
        soup = bs4.BeautifulSoup(page.content, "lxml", from_encoding="utf-8")
        results = soup.findAll("div", "ZmHEEd")
        app_name = (
            results[0].findNext("div", "Vpfmgd").findNext("div", "WsMG1c nnK0zc").text
        )
        app_dev = results[0].findNext("div", "Vpfmgd").findNext("div", "KoLSrc").text
        app_dev_link = (
            "https://play.google.com"
            + results[0].findNext("div", "Vpfmgd").findNext("a", "mnKHRc")["href"]
        )
        app_rating = (
            results[0]
            .findNext("div", "Vpfmgd")
            .findNext("div", "pf5lIe")
            .find("div")["aria-label"]
        )
        app_link = (
            "https://play.google.com"
            + results[0]
            .findNext("div", "Vpfmgd")
            .findNext("div", "vU6FJ p63iDd")
            .a["href"]
        )
        app_icon = (
            results[0]
            .findNext("div", "Vpfmgd")
            .findNext("div", "uzcko")
            .img["data-src"]
        )
        app_details = "<a href='" + app_icon + "'>📲</a>"
        app_details += " <b>" + app_name + "</b>"
        app_details += (
            "\n\n<code>Developer :</code> <a href='" + app_dev_link + "'>" + app_dev + "</a>"
        )
        app_details += "\n<code>Rating :</code> " + app_rating.replace(
            "Rated ", "⭐ "
        ).replace(" out of ", "/").replace(" stars", "", 1).replace(
            " stars", "⭐ "
        ).replace(
            "five", "5"
        )
        app_details += (
            "\n<code>Features :</code> <a href='" + app_link + "'>View in Play Store</a>"
        )
        app_details += "\n\n===> @MissJuliaRobot <==="
        await e.reply(app_details, link_preview=True, parse_mode="HTML")
    except IndexError:
        await e.reply("No result found in search. Please enter **Valid app name**")
    except Exception as err:
        await e.reply("Exception Occurred:- " + str(err))
import codecs
import googlemaps
import os
import sys
import zipfile
import bs4

# GET NAME OF FILE
name = sys.argv[1]

# EXTRACT KMZ
with zipfile.ZipFile(name, "r") as zipper:
    zipper.extractall("")

# OPEN FILE
data = codecs.open('doc.kml', encoding='utf-8').read()
os.remove('doc.kml')

# PARSING XHTML
doc = bs4.BeautifulSoup(data, 'html.parser')
name = doc.find('placemark').find('name').text
coords = doc.find('placemark').find('point').find('coordinates').text.split(',')
coords = (float(coords[1]), float(coords[0]))

# GETTING ELEVATION
client = googlemaps.Client(key=open(r'Data\elevation key.txt').read())
elevation = googlemaps.elevation.elevation(client, locations=coords)[0]['elevation']

# PRINT RESULT
print("Name: " + name)
print("Elevation: " + str(elevation))
input()
import bs4 as bs
import urllib.request
# MADE BY RANJITH

home = urllib.request.urlopen('http://www.vg.no/').read()
soup = bs.BeautifulSoup(home, 'lxml')

list = []
main_li = []


def scraper():
    def article_list(list):
        for article in soup.find_all('div', class_='article-content'):
            link = article.find('a')
            try:
                link = link.get('href')
                if "http" not in link:
                    if 'nyheter/' in link:
                        list.append("http://www.vg.no" + link)
            except:
                pass

    def article_read(article):
        site = urllib.request.urlopen(article).read()
        soup = bs.BeautifulSoup(site, 'lxml')

        def article_title():
            header = soup.find('div', class_='reg-grid-full')
            title = header.find('h1', class_='main-title')
            title = title.text.strip()
            try:
import bs4 as bs
from urllib.request import Request, urlopen
import pandas as pd
import datetime
import csv

now = str(datetime.datetime.now())[:10]
sauce = Request('https://www.the303columbus.com/floorplans.aspx',
                headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(sauce).read()
soup = bs.BeautifulSoup(webpage, 'lxml')
table = soup.find_all('table')

myrow = []
for tr in table:
    td = tr.find_all('td')
    row = [i.text for i in td]
    myrow.append(row)

myrow = pd.DataFrame(myrow)
myrow = myrow.drop(myrow.columns[[0, 2, 4, 6, 8, 9, 10, 11, 12, 13]], axis=1)
myrow.columns = ['bed', 'bath', 'SQ.FT', 'rent']
myrow.index = ['Glenn', 'Nicklaus', 'Allen', 'Stine', 'Oakley', 'Campbell']
myrow.index.name = now

filename = datetime.datetime.now().strftime('columbus303-%Y-%m-%d.csv')
myrow.to_csv(filename)
def _download_data(self, *, session: Optional[requests.Session] = None) -> _TopcoderData:
    session = session or utils.get_default_session()

    # download HTML
    url = 'https://community.topcoder.com/stat?c=problem_statement&pm={}'.format(self.problem_id)
    resp = utils.request('GET', url, session=session)

    # parse HTML
    soup = bs4.BeautifulSoup(resp.content.decode(resp.encoding), utils.html_parser)
    problem_texts = soup.find_all('td', class_='problemText')
    if len(problem_texts) != 1:
        raise SampleParseError("""<td class="problemText"> is not found or not unique""")
    problem_text = problem_texts[0]

    # parse Definition section
    # format:
    #     <tr>...<h3>Definition</h3>...<tr>
    #     <tr><td>...</td>
    #         <td><table>
    #             ...
    #             <tr><td>Class:</td><td>...</td></tr>
    #             <tr><td>Method:</td><td>...</td></tr>
    #             ...
    #         </table></td></tr>
    log.debug('parse Definition section')
    h3 = problem_text.find('h3', text='Definition')
    if h3 is None:
        raise SampleParseError("""<h3>Definition</h3> is not found""")
    definition = {}
    for text, key in {
        'Class:': 'class',
        'Method:': 'method',
        'Parameters:': 'parameters',
        'Returns:': 'returns',
        'Method signature:': 'method_signature',
    }.items():
        td = h3.parent.parent.next_sibling.find('td', class_='statText', text=text)
        log.debug('%s', td.parent)
        definition[key] = td.next_sibling.string

    # parse Examples section
    # format:
    #     <tr>...<h3>Examples</h3>...<tr>
    #     <tr><td>0)</td><td></td></tr>
    #     <tr><td></td>
    #         <td><table>
    #             ...
    #             <pre>{5, 8}</pre>
    #             <pre>"foo"</pre>
    #             <pre>3.5</pre>
    #             <pre>Returns: 40.0</pre>
    #             ...
    #         </table></td></tr>
    #     <tr><td>1)</td><td></td></tr>
    #     ...
    log.debug('parse Examples section')
    h3 = problem_text.find('h3', text='Examples')
    if h3 is None:
        raise SampleParseError("""<h3>Examples</h3> is not found""")
    raw_sample_cases = []  # type: List[Tuple[List[str], str]]
    cursor = h3.parent.parent
    while True:
        # read the header like "0)"
        cursor = cursor.next_sibling
        log.debug('%s', cursor)
        if not cursor or cursor.name != 'tr':
            break
        if cursor.find('td').string != '{})'.format(len(raw_sample_cases)):
            raise SampleParseError("""<td ...>{})</td> is expected, but not found""".format(len(raw_sample_cases)))

        # collect <pre>s
        cursor = cursor.next_sibling
        log.debug('%s', cursor)
        if not cursor or cursor.name != 'tr':
            raise SampleParseError("""<tr>...</tr> is expected, but not found""")
        input_items = []
        for pre in cursor.find_all('pre'):
            marker = 'Returns: '
            if pre.string.startswith(marker):
                output_item = pre.string[len(marker):]
                break
            else:
                input_items.append(pre.string)
        else:
            raise SampleParseError("""<pre>Returns: ...</pre> is expected, but not found""")
        raw_sample_cases.append((input_items, output_item))

    # convert samples cases to the Greed format
    sample_cases = []
    for i, (input_items, output_item) in enumerate(raw_sample_cases):
        sample_cases.append(TestCase(
            'example-{}'.format(i),
            'input',
            ('\n'.join(map(_convert_to_greed, input_items)) + '\n').encode(),
            'output',
            (_convert_to_greed(output_item) + '\n').encode(),
        ))

    return _TopcoderData(definition=definition, raw_sample_cases=raw_sample_cases, sample_cases=sample_cases)
#! /usr/bin/python3
# Open several Google search results.

import requests, sys, webbrowser, bs4

print('Googling...')    # display text while downloading the Google page
res = requests.get('http://google.com/search?q=' + ' '.join(sys.argv[1:]))
res.raise_for_status()

# Retrieve top search result links.
soup = bs4.BeautifulSoup(res.text, "lxml")

# Open a browser tab for each result.
linkElems = soup.select('.r a')
numOpen = min(5, len(linkElems))
for i in range(numOpen):
    webbrowser.open('http://google.com' + linkElems[i].get('href'))
import pandas as pd
import requests
import bs4

#%%
add_1 = 'https://en.wikipedia.org/wiki/Lists_of_writers'
# page = requests.get('https://www.newswire.ca/news/air-canada?page=1&pagesize=100')
add_2 = 'https://www.newswire.ca/news/air-canada?page=1&pagesize=100'
add_3 = 'https://www.cision.ca/resources/'

page = requests.get(add_2)
# soup = bs4.BeautifulSoup(page)
soup = bs4.BeautifulSoup(page.content, 'html.parser')
names = soup.findAll('a')

#%%
BaseURL = 'https://www.newswire.ca'
Titr = []
URL = []
for i in range(1, 11):
    add = 'https://www.newswire.ca/news/air-canada?page=' + str(i) + '&pagesize=100'
    page = requests.get(add)
    soup = bs4.BeautifulSoup(page.content, 'html.parser')
    names = soup.findAll('a')
    print(add)
    for name in names:
        try:
#! /usr/local/bin/python3

import bs4

exampleFile = open('example.html')
exampleSoup = bs4.BeautifulSoup(exampleFile.read(), "lxml")
print(type(exampleSoup))       # <class 'bs4.BeautifulSoup'>

elems = exampleSoup.select('#author')
print(type(elems))             # <class 'list'>
print(len(elems))              # 1
print(type(elems[0]))          # <class 'bs4.element.Tag'>
print(str(elems[0]))           # '<span id="author">Al Sweigart</span>'
print(elems[0].getText())      # 'Al Sweigart'
print(elems[0].attrs)          # {'id': 'author'}
print()