Example #1
import bs4
import requests


def parsePrice():
    # Fetch the Yahoo Finance quote page ('yahoo url' is a placeholder).
    r = requests.get('yahoo url')
    soup = bs4.BeautifulSoup(r.text, "html.parser")
    # The price sits in a span inside a div with Yahoo's generated class names.
    price = soup.find_all(
        'div',
        {'class': 'My(6px) Pos(r) smartphone_Mt(6px)'})[0].find('span').text
    return price
Example #2
import os
import textwrap
from string import Template

import requests
from bs4 import BeautifulSoup


def get(num, overwrite):
    euler_url = 'http://projecteuler.net/'
    url = '{0}problem={1}'.format(euler_url, num)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    c = soup.find_all(id='content')[0]
    problem_text = []
    resource_file = None
    for x in c.find_all('sup'):
        x.string = '^' + ''.join(list(x.stripped_strings))

    for x in c.find_all('sub'):
        x.string = '_' + ''.join(list(x.stripped_strings))

    for x in c.find_all('div', class_='problem_content'):
        if x.find_all('a'):
            resource_file = '{0}{1}'.format(
                euler_url, x.find_all('a', href=True)[0]['href'])
        problem_text.append(' '.join(list(x.stripped_strings)))

    problem_text = '\n'.join(textwrap.wrap('\n'.join(problem_text)))
    problem_text = problem_text.replace(' ^', '^')
    problem_text = problem_text.replace('^ ', '^')
    problem_text = problem_text.replace(' _', '_')
    problem_text = problem_text.replace('_ ', '_')
    problem_name = (c.find_all('h2')[0].string
                    .lower()
                    .replace(' ', '_')
                    .replace('-', '_'))

    print(num)
    print(problem_name, end='\n\n')
    print(problem_text)

    if resource_file:
        resource_file_name = './data/{0:03d}_{1}.txt'.format(num, problem_name)
        make_file(resource_file_name,
                  requests.get(resource_file).content,
                  executable=False)

        resource = ("\nwith open('{0}', 'r') as f:\n    DATA = f.readlines()\n\n"
                    ''.format(resource_file_name))

    else:
        resource = ''

    t = Template(TEMPLATE)
    s = t.safe_substitute(number=num, text=problem_text, resource=resource)
    file_name = './{0:03d}_{1}.py'.format(num, problem_name)
    if not os.path.isfile(file_name):
        make_file(file_name, s)
    else:
        if overwrite:
            make_file(file_name, s)
        else:
            print('\n{0} already exists'.format(file_name))
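Example #2 calls `make_file` and substitutes into `TEMPLATE`, neither of which is shown. A minimal sketch under the assumption that `TEMPLATE` is a `string.Template` source with `number`, `text` and `resource` placeholders and that `make_file` writes text or bytes to disk:

import os
import stat

# Hypothetical template; the real one is not part of the excerpt.
TEMPLATE = '"""Project Euler problem ${number}.\n\n${text}\n"""\n${resource}\n'


def make_file(name, content, executable=True):
    # Hypothetical helper: write str or bytes to `name`, creating parent directories.
    os.makedirs(os.path.dirname(name) or '.', exist_ok=True)
    mode = 'wb' if isinstance(content, bytes) else 'w'
    with open(name, mode) as f:
        f.write(content)
    if executable:
        os.chmod(name, os.stat(name).st_mode | stat.S_IEXEC)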
def get_container(product):
    global URL
    tokens = nltk.word_tokenize(product)
    for token in tokens:
        URL += token + '%20'
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, features="html.parser")
    print(soup.prettify())
    card_grid = soup.find("div", id="card_grid", recursive=True)

    return card_grid
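The function above depends on a module-level `URL` and on `nltk`, `requests` and BeautifulSoup; a minimal sketch of the assumed setup (the search endpoint is a placeholder):

import nltk
import requests
from bs4 import BeautifulSoup

# Placeholder search endpoint; get_container() appends URL-encoded tokens to it.
URL = 'https://example.com/search?q='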
 def set_title_and_meta(self, htmltext):
     """Clean the HTML before extracting features: strip comments, styles, scripts, forms and tables."""
     soup = BeautifulSoup(htmltext.decode('utf-8'), 'html.parser')
     comments = soup.findAll(text=lambda text: isinstance(text, Comment))
     [comment.extract() for comment in comments]
     [style.extract() for style in soup.findAll('style')]
     [script.extract() for script in soup.findAll('script')]
     [form.extract() for form in soup.findAll('form')]
     [table.extract() for table in soup.findAll('table')]
     self.set_title(soup)
     self.set_meta(soup)
     htmldoc = re.sub(
         '<[^>]*>', '',
         soup.body.prettify()).encode('utf-8') if soup.body else None
     self.set_permit_and_content(htmldoc)
Example #6
import datetime
import hashlib

from bs4 import BeautifulSoup


def get_events():
    """Get the events from the webpage and format them for use in Google Calendar."""
    content = get_content()
    events = []
    for page in content:
        month = page["month"].month
        year = page["month"].year
        soup = BeautifulSoup(page["content"], "lxml")
        agenda_items = soup.find_all('ul', class_='agendaitems')

        # The calendar page has two columns for events.
        for column in agenda_items:
            items = column.find_all('li')  # Each event is a list.
            for item in items:
                title = item.find("span", class_='kopje').get_text()
                url = CREA_MORE_INFO_ADDRESS.format(item.find_all("a", href=True)[0]['href'])
                description = item.find('span', class_='tekst').get_text()
                item_datetime = item.find('em', class_="datum").get_text().split('|')

                dates = ['-'.join([str(year), str(month), a[1]])
                         for a in [d.strip().split(' ') for d in item_datetime[:-1]]]


                # Gracefully handle events that occur on multiple days.
                for date in dates:
                    start_time = item_datetime[-1].strip() + ':00'
                    event_start = date + 'T' + start_time
                    event_end = (datetime.datetime.strptime(event_start, '%Y-%m-%dT%H:%M:%S') + datetime.timedelta(hours=2)).strftime('%Y-%m-%dT%H:%M:%S')

                    # Hashes are used as a unique identifier to prevent the same event from being added twice.
                    event_hash = hashlib.sha224(str(title + url + date + start_time + description).encode('utf-8')).hexdigest()

                    event = {"summary": title,
                             "start": {
                                    "dateTime": event_start,
                                    "timeZone": "Europe/Amsterdam"
                                },
                             "end": {
                                    "dateTime": event_end,
                                    "timeZone": "Europe/Amsterdam"
                                },
                             "description": url + description,
                             "id": event_hash}
                    events.append(event)
    return events
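A hedged sketch of how the returned event dicts might be pushed to Google Calendar, assuming an authorized `service` object built with `googleapiclient.discovery.build('calendar', 'v3', credentials=creds)`; the calendar id and the error handling are assumptions:

from googleapiclient.errors import HttpError

for event in get_events():
    try:
        # The sha224-based id keeps repeated runs from inserting the same event twice:
        # re-inserting an existing id raises an HTTP conflict instead of duplicating it.
        service.events().insert(calendarId='primary', body=event).execute()
    except HttpError:
        pass  # the event already exists, or the API rejected it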
def md5_decrypt(encrypt_string):
	headers_somd5 = {
	'x-requested-with': 'XMLHttpRequest',
	'Accept-Language': 'zh-cn',
	'Referer': 'http://www.somd5.com/',
	'Accept': '*/*',
	'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
	'Accept-Encoding': 'gzip, deflate',
	'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
	'Host': 'www.somd5.com',
	'Proxy-Connection': 'Keep-Alive',
	'Pragma': 'no-cache'
	}
	url_somd5 = 'http://www.somd5.com/somd5-index-md5.html'
	data = 'isajax=QoG29V7X6mEGHt6Ep8pTI43&md5='+encrypt_string
	cj = cookiejar.CookieJar()
	opener = request.build_opener(request.HTTPCookieProcessor(cj))
	req1 = opener.open(url_somd5)
	resp1 = req1.read()
	for a in cj:
		cookie_name = a.name
		cookie_value = a.value
	length = 35+len(encrypt_string)
	data_encode = data.encode()
	req2 = request.Request(url=url_somd5, data=data_encode,
			headers=headers_somd5, method='POST')
	req2.add_header('Content-Length', str(length))
	req2.add_header('Cookie', cookie_name + '=' + cookie_value)
	resp2 = request.urlopen(req2)
	resp2 = resp2.read(1000)

	try:
		resp2_decode = resp2.decode(encoding='gb18030')
	except UnicodeDecodeError:
		resp2_decode = resp2.decode(encoding='utf-8')

	bs = BeautifulSoup(resp2_decode, 'html.parser')

	print('Starting decryption...')
	for b in bs.find_all('h1'):
		text = b.get_text()
		print('Decrypted: {}'.format(text))
		sys.exit()
	print('Decryption failed...')
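A hedged usage example for the function above; the hash is the MD5 of the string '123456':

md5_decrypt('e10adc3949ba59abbe56e057f20f883e')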
def main():
    global f_w
    t = datetime.datetime.now()
    t_format = t.strftime('%Y-%m-%d-%H-%M-%S')  # timestamp for the output file name
    f_name = 'cirt.net-passwords-' + t_format + '.txt'
    f_w = open(f_name, 'w+')

    req = request.urlopen(url)
    resp = req.read()
    resp_decode = resp.decode()
    bs = BeautifulSoup(resp_decode, 'html.parser')
    print('\n starting to process...\n')
    # Each <tr> holds vendor links; the names are collected here and used below
    # to fetch every vendor's default-password page. A row looks like:
    # <tr><td><a href="?vendor=Huawei Technologies Co">Huawei Technologies Co</a></td>...</tr>
    for a in bs.find_all('tr'):
        for b in a.find_all('a'):
            cs_name = b.get_text()  # vendor name, e.g. "Huawei Technologies Co"
            changshang.append(cs_name)

    print(changshang)
    print('cs len is :{}'.format(len(changshang)))
    length_cs = len(changshang)
    for i in range(0, length_cs, 10):  # process vendors in batches of up to 10 threads
        k = min(10, length_cs - i)
        for j in range(k):  # spawn k worker threads; output is serialized with a lock
            j = i + j
            t = threading.Thread(target=huoqu, args=(changshang[j],))
            threads.append(t)
        for m in range(len(threads)):
            threads[m].start()
        for n in range(len(threads)):
            threads[n].join()
        threads.clear()
    print('Work finished, sleeping for a while...')
    sleep(6)
    f_w.close()
def meiju(k):
	global flag, count  # must be declared global again here, otherwise they become locals
	try:
		data = url + '/?author=' + str(k)
		req = request.urlopen(data, timeout=20)  # long timeout for a slow network
		resp = req.read(2000)
		data_decode = resp.decode(encoding='utf-8')
		data_soup = BeautifulSoup(data_decode, 'html.parser')
		for a in data_soup.find_all('title'):
			# FreeBuf renders the title as '<title>\n  sn0rt - FreeBuf.COM</title>',
			# so strip the surrounding whitespace first.
			b = a.get_text().strip()
			# Split on spaces, e.g. ['sn0rt', '-', 'FreeBuf.COM']; the exact separator
			# depends on the site, but the username usually comes first.
			b = b.split(' ')
			b = b[0].strip()  # strip again so no space is left after the username
			print('User {0}: {1}'.format(k, b))
			f_w.write('User ' + str(k) + ': ' + b + '\n')  # k is an int, write it as a string
	except Exception as e:
		excepts.append(k)
		length = len(excepts)
		if length > 3:  # several consecutive failing ids mean the author list has ended
			excepts.sort()
			for i in range(1, length):
				if i + 1 < length:
					if (excepts[i] - excepts[i-1] == 1) and (excepts[i+1] - excepts[i] == 1):
						# With multiple threads the failures are not necessarily adjacent,
						# so any run of ids differing by 1 counts.
						count.append(excepts[i-1])
						flag = False

			if len(count) > 3:
				count = list(set(count))
				print('count is : {}'.format(count))
		else:
			print('{0} error is : {1}'.format(k, e))
def huoqu(huoqu_url):  # fetch one vendor's default-username/password page
    # Build the vendor page URL; the vendor name must be percent-encoded or the request fails.
    url_hq = cs_url + parse.quote(huoqu_url)
    print(url_hq)
    requ = request.urlopen(url_hq)
    response = requ.read()
    response_decode = response.decode()
    bs_huoqu = BeautifulSoup(response_decode, 'html.parser')
    print('\nVendor: {}'.format(huoqu_url))
    f_w.write('Vendor: {}'.format(huoqu_url) + '\n')
    # Serialize output with the shared module-level lock so lines from
    # different threads do not interleave.
    lock.acquire()

    for c in bs_huoqu.find_all('tr'):
        result = re.search(patt_href, c.get_text())
        if result is not None:
            pass
        else:
            # Header rows start with digits and are printed as-is, without splitting.
            result_title = re.match(r'\d+', c.get_text())
            flag = False
            if result_title is None:
                flag = True
            else:
                print(c.get_text())
                f_w.write(c.get_text() + '\n')
            if flag:  # prettier output: username : password
                d = c.get_text('\t: ')
                split_v = d.split(':')
                v1 = split_v[0]  # 'User ID' or 'PASSWORD'
                v1 = v1.strip('\t')
                length = len(v1)
                if length == 8:
                    print(c.get_text(' : '))
                    f_w.write(c.get_text(' : ') + '\n')
                else:
                    print(c.get_text('\t : '))
                    f_w.write(c.get_text('\t : ') + '\n')
    print('\n\n')
    f_w.write('\n')  # blank line between vendors
    lock.release()
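The cirt.net functions above (`main` and `huoqu`) reference several module-level names that the excerpt does not show. A minimal sketch of plausible definitions, assuming the target is the cirt.net default-password listing; the exact URLs and the `patt_href` pattern are assumptions:

import re
import threading
import datetime
from time import sleep
from urllib import request, parse

from bs4 import BeautifulSoup

url = 'https://cirt.net/passwords'             # vendor index page (assumed)
cs_url = 'https://cirt.net/passwords?vendor='  # per-vendor page prefix (assumed)
patt_href = re.compile('href=')                # rows matching this are skipped (assumed)
changshang = []                                # vendor names collected by main()
threads = []                                   # worker threads for the current batch
lock = threading.Lock()                        # shared lock that serializes huoqu() output
f_w = None                                     # output file handle, opened in main()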
Example #13
import bs4 as bs
import urllib.request
import sqlite3

baglanti = sqlite3.connect("ornek.db")
isaretci = baglanti.cursor()
kaynak = urllib.request.urlopen("http://www.milliyet.com").read()
sayfa = bs.BeautifulSoup(kaynak, 'lxml')
tablo = isaretci.execute('''CREATE TABLE linkler(id INTEGER PRIMARY KEY, link VARCHAR(255))''')

for nav in sayfa.findAll('a'):
    isaretci.execute('''INSERT INTO linkler(link) VALUES(?)''', (nav.get('href'),))
    print(nav.get('href'))
print(sayfa.title.string)
print(sayfa.findAll('p'))
# finds everything with a <p> tag
for paragraf in sayfa.findAll('p'):
    print(paragraf.string)
sonuc = isaretci.execute("SELECT * FROM linkler")
print(sonuc.fetchall())
baglanti.commit()
Example #14
 def createParser(self, rawHtml):
     self.parser = BeautifulSoup(rawHtml, 'html.parser')
Example #15
 def __init__(self, rawHtml):
     self.parser = BeautifulSoup(rawHtml, 'html.parser')
Example #16
import bs4
import requests

res = requests.get('http://nostarch.com')
res.raise_for_status()  # raise if the download failed
nostarchsoup = bs4.BeautifulSoup(res.text, 'html.parser')
print(type(nostarchsoup))  # <class 'bs4.BeautifulSoup'>
Example #17
def get_page(url):
    return BeautifulSoup(GET(url), 'html.parser')
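`GET` is not defined in this snippet (nor in the identical one in Example #18); a minimal sketch, assuming it is a thin wrapper around `requests.get` that returns the page HTML:

import requests
from bs4 import BeautifulSoup


def GET(url):
    # Hypothetical helper: fetch the page and return its HTML text.
    resp = requests.get(url)
    resp.raise_for_status()
    return resp.text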
Example #18
def get_page(url):
    return BeautifulSoup(GET(url), 'html.parser')
import requests, bs4, openpyxl

# open the workbook and pick the sheet to dump data into
wb = openpyxl.load_workbook('Book1.xlsx')
sheet = wb['Sheet1']

# identify the website and fetch the HTML document for the page
url_str = 'website.com'
res = requests.get(url_str)
res.raise_for_status()  # raises if the page returned an HTTP error code
htmldoc = bs4.BeautifulSoup(res.text, 'html.parser')

# collect the matching elements with a CSS selector ('.' selects by class)
elements_list = htmldoc.select('.question-summary narrow')

# for each element found, write its text and link into the Excel sheet
i = 2
for instance in elements_list:
    sheet.cell(row=i, column=1).value = instance.get_text()
    sheet.cell(row=i, column=2).value = instance.get('href')
    i += 1

# save the workbook and close
wb.save('Book1.xlsx')
wb.close()
Example #20
class Connection(object):
    def __init__(self, username=None, password=None, verify=False):
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.session = requests.Session()
        self.verify = verify
        if username and password:
            self.base_uri = '://www.nsof.class.noaa.gov/saa/products/'
            self.authenticate = Auth(username, password)
            self.get('welcome')
            self.translator = Translator()
            self.authenticate.do(self)
            self.request = Request(self)
            self.subscribe = Subscribe(self)
        else:
            self.base_uri = '://www.nsof.class.noaa.gov'

    def next_up_datetime(self):
        end = datetime.utcnow()
        self.get('')
        middle = self.last_response_soup.select('#middle p')
        if len(middle) > 0:
            text = middle[1].text
            regex = re.compile(", (.*), from (.*) UTC .* through (.*) UTC")
            params = list(regex.findall(text)[0])
            pattern = '%m/%d/%y %H%M'
            begin = datetime.strptime('%s %s' % tuple(params[0:2]), pattern)
            end = datetime.strptime('%s %s' % (params[0], params[2]), pattern)
            if begin >= end:
                end += timedelta(days=1)
        from pytz import utc
        return end.replace(tzinfo=utc)

    @property
    def cookies(self):
        self._cookies = requests.utils.cookiejar_from_dict(
            requests.utils.dict_from_cookiejar(self.session.cookies))
        return self._cookies

    @property
    def last_response(self):
        return self._last_response

    @last_response.setter
    def last_response(self, response):
        packed = self.pack(response).select('h1')
        if (response.status_code != requests.codes.ok
                or (packed and 'An Error Occurred' in packed[0].text)):
            raise Exception('Connection error (%i).' % response.status_code)
        self._last_response = response

    @property
    def last_response_soup(self):
        return self.pack(self.last_response)

    def get(self, url, proto='http'):
        """
        Load an url using the GET method.

        Keyword arguments:
        url -- the Universal Resource Location
        proto -- the protocol (default 'http')
        """
        self.last_response = self.session.get(proto + self.base_uri + url,
                                              headers=self.headers,
                                              cookies=self.cookies,
                                              allow_redirects=True,
                                              verify=self.verify)
        return self.last_response_soup

    def pack(self, response, asynchronous=False):
        soup = BeautifulSoup(response.text, 'html.parser')
        if asynchronous:
            # The response will not be reused, so close it right away.
            response.close()
        return soup
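A hedged usage sketch of the class above; the credentials are placeholders and assume a valid NOAA CLASS account:

conn = Connection(username='user', password='secret')
# End of the next announced window, as a timezone-aware UTC datetime.
print(conn.next_up_datetime())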
Example #21
import requests
from bs4 import BeautifulSoup

# Download the page and save the raw HTML to a file.
vnexpress = requests.get("https://vnexpress.net/")
print(vnexpress)
file_name = "vnexpress.html"
file_html = open(file_name, "wb")
file_html.write(vnexpress.content)
file_html.close()

# Read the saved file back and parse it.
open_file = open(file_name, "rb")
decoded_content = open_file.read().decode('utf-8')
open_file.close()
trangweb_vnexpress = BeautifulSoup(decoded_content, "html.parser")
print(trangweb_vnexpress.find("div", attrs={"class": "scroll-pane"}))
Example #22
 def table(self):
     page = requests.get('http://www.star.nesdis.noaa.gov/smcd/spb/fwu/'
                         'homepage/GOES_Imager_Vis_OpCal.php')
     pq = BeautifulSoup(page.text, 'html.parser')
     return (pq.select("table")[2]).select("tr")
Example #23
import requests
from bs4 import BeautifulSoup
import json

fungame = []

url = "https://www.miniclip.com/games/en/"

page = requests.get(url).content
soup = BeautifulSoup(page, 'html.parser')
games = soup.find_all('article', class_='slick-slide')

for game in games:
    # Skip the placeholder slides.
    if 'Play_code' in game['class']:
        continue

    # Grab the game's image URL (the <img> carries itemprop="image").
    img = game.find('img', itemprop="image")
    picture = img.get('src') if img else None

    mygame = {'picture': picture}
    fungame.append(mygame)

# print(games[0].prettify())

with open('data.json', 'w') as outfile:
    json.dump(fungame, outfile)
import requests
from bs4 import BeautifulSoup


def response_text():
    url = 'https://www.gismeteo.ru/weather-yaroslavl-4313/month/'
    return requests.get(url,
                        headers={
                            'user-agent':
                            'Mozilla/5.0 (X11; Linux x86_64) \
                                      AppleWebKit/537.36 (KHTML, like Gecko)\
                                      Chrome/84.0.4147.89 Safari/537.36'
                        }).text


def get_bs4_elements_tag(search_location, name_tag, class_tag):
    return search_location.findAll(name_tag, class_=class_tag)


soup = BeautifulSoup(response_text(), 'lxml')
div_tooltip_cell = get_bs4_elements_tag(soup, 'div', 'tooltip cell')
div_tooltip_cell_hover = get_bs4_elements_tag(soup, 'div',
                                              'tooltip cell _hover')


def get_bs4_element_tag(search_location, name_tag, class_tag):
    return search_location.find(name_tag, class_=class_tag)


def parse_date(bs4_element_tag):
    result = get_bs4_element_tag(bs4_element_tag, 'div', 'date')
    if len(result.contents) > 1:
        return result.contents[0].contents[0].string
    else:
        return result.contents[0].string
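A minimal usage sketch of the helpers above, assuming every collected cell contains a child `div` with class `date`:

if __name__ == '__main__':
    # Print the parsed date of every day cell found on the month page.
    for cell in div_tooltip_cell + div_tooltip_cell_hover:
        print(parse_date(cell))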
Example #25
try:
    a = int(input("enter a number"))
    b = int(input("enter another number"))
    ans = a / b
    print(ans)
except ZeroDivisionError as ex:
    print("you cannot divide a number by 0")
except KeyboardInterrupt as key:
    print("\nprogram exited by user")

# bs4
from bs4 import BeautifulSoup
import requests

try:
    page = requests.get("https://twitter.com/adhikaritaiwan")
    soup = BeautifulSoup(page.content, 'html.parser')
    all_item = soup.find_all(class_="content")

    for content in all_item:
        tweet_text = content.find(class_="js-tweet-text-contains")
        print(tweet_text)
except Exception as ex:
    print("Couldn't connect to twitter")