Esempio n. 1
0
 def GetPageData(self, data_list):
     """Fetch each piao.damai.cn page in ``data_list`` and print its details.

     For every item the page ``https://piao.damai.cn/<item>.html`` is
     requested with ``self.header`` as the request headers and parsed with
     the ``lxml`` parser.  The title, venue and show time are printed on one
     line separated by ``|*-*|``, then the method sleeps for two seconds.

     Args:
         data_list: Iterable of strings; each one is placed between the
             host and ``.html`` to form the page URL.

     Raises:
         AttributeError: If a page lacks one of the expected elements
             (``find`` returns ``None`` in that case).
     """
     # Named ``base_url`` rather than ``str`` so the builtin is not shadowed.
     base_url = 'https://piao.damai.cn/'
     for item in data_list:
         request = urllib.request.Request(base_url + item + ".html",
                                          headers=self.header)
         # NOTE(review): urlopen's timeout is expressed in seconds, so 5000
         # is roughly 83 minutes; kept unchanged -- confirm that it was not
         # meant as milliseconds.
         # The context manager closes the connection once the body is read.
         with urllib.request.urlopen(request, timeout=5000) as response:
             pagedata = response.read()
         page = bt(pagedata, 'lxml')
         showtime = page.find('div',
                              class_='m-sdbox m-showtime').span.string
         venue = page.find('div', class_='m-sdbox m-venue').div.a.string
         title = page.find('h2', class_='tt').span.string
         print(title, venue, showtime, sep='|*-*|')
         # Pause between requests.
         time.sleep(2)
Esempio n. 2
0
def fetchtopic():
    """Scrape the topic links from ``http://www.cnbeta.com/#1``.

    Returns:
        A list of dicts, one per element whose class is ``topic``.  Each
        dict has ``'url'`` (the module-level ``host`` prefix plus the
        link's ``href``) and ``'title'`` (the ``.string`` of the link's
        ``<strong>`` tag).
    """
    # Imported locally so the fix does not rely on the file's import block.
    from contextlib import closing

    # urllib2 responses are not context managers; closing() releases the
    # connection even when parsing raises.
    with closing(urllib2.urlopen('http://www.cnbeta.com/#1')) as res:
        soup = bt(res, from_encoding='gbk')

    return [
        {'url': host + topic.a['href'], 'title': topic.a.strong.string}
        for topic in soup.find_all(attrs={'class': 'topic'})
    ]
Esempio n. 3
0
import requests
from bs4 import BeautifulSoup as bt

# A timeout keeps the script from blocking forever on a stalled connection.
response = requests.get('https://github.com/login', timeout=10)

obj = bt(response.text, 'html.parser')

# Get the token: the value of the <input> named "authenticity_token".
_token_input = obj.find(name='input', attrs={
    'name': 'authenticity_token'
})
if _token_input is None:
    # find() returns None when nothing matches; stop with a clear message
    # instead of an AttributeError on None.
    raise SystemExit('authenticity_token input not found on the login page')
login = _token_input.get('value')
print(login)
Esempio n. 4
0
import requests
from bs4 import BeautifulSoup as bt
item = 'shoes'
url = "https://www.flipkart.com/search"
# The query is passed as ``params`` so requests percent-encodes the search
# term; an item containing spaces or '&' no longer corrupts the URL.
params = {
    'q': item,
    'otracker': 'search',
    'otracker1': 'search',
    'marketplace': 'FLIPKART',
    'as-show': 'on',
    'as': 'off',
}
# A timeout keeps the script from blocking forever on a stalled connection.
source = requests.get(url, params=params, timeout=10).text
# Debug aid: uncomment to save the raw HTML for inspecting the markup.
# with open('test_html.txt', 'w', encoding='utf-8') as fid:
#     fid.write(source)
soup = bt(source, 'lxml')
match = soup.find_all('div', class_='_1vC4OE')
# Print every <div> whose class is "_1vC4OE".
for z in match:
    print(z)
Esempio n. 5
0
import requests
from bs4 import BeautifulSoup as bt
import json
import re

URL = 'https://en.wikipedia.org/wiki/Solar_cell'
# A timeout keeps the script from blocking forever on a stalled connection.
page = requests.get(URL, timeout=10)
# Fail here on an HTTP error instead of parsing an error page below.
page.raise_for_status()

# Parse the downloaded page.
soup = bt(page.content, 'html.parser')

# Target element (id "mw-content-text") that contains the paragraphs.
content = soup.find(id='mw-content-text')


def get_citations_needed_count():
    """Build the report line stating how many citations are needed.

    Counts the elements inside the module-level ``content`` whose class is
    ``noprint Inline-Template Template-Fact`` and returns that count
    embedded in a message string.
    """
    citation_class = "noprint Inline-Template Template-Fact"
    flagged = content.find_all(class_=citation_class)
    total = len(flagged)
    return f"Number of citation :{total}"


# Build the report line, then write it to standard output.
_citation_report = get_citations_needed_count()
print(_citation_report)

Esempio n. 6
0
def fetchdetail(url):
    """Download ``url`` and return it parsed as a soup object.

    Args:
        url: Address of the page to fetch.

    Returns:
        The object built by ``bt`` from the response, parsed with
        ``from_encoding='gbk'``.
    """
    # Imported locally so the fix does not rely on the file's import block.
    from contextlib import closing

    # urllib2 responses are not context managers; closing() releases the
    # connection even when parsing raises.
    with closing(urllib2.urlopen(url)) as res:
        # NOTE(review): an earlier commented-out line looked up
        # id='news_content' on the result -- presumably the article body.
        return bt(res, from_encoding='gbk')