def getSoup(self):
    response = R.get(self.strURL, headers={'User-Agent': UserAgent().chrome})
    self.soup = BS(response.content, "html.parser")
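# getSoup above assumes a class with a `strURL` attribute plus these imports,
# inferred from the calls (a sketch, not part of the original source):
# import requests as R
# from bs4 import BeautifulSoup as BS
# from fake_useragent import UserAgent  # UserAgent().chrome supplies a browser UA string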
from requests import Session

s = Session()
kk = open('t.txt', 'r').read()
w = open('link_t_shirt.txt', 'w')
kk = kk.split('\n')
m = 'http://www.esuppliersindia.com'
#kk = ['http://www.esuppliersindia.com/suppliers/apparel-fashion/t-shirts/']
for i in kk:
    r = s.post(i)
    soup = BS(r.content, 'html.parser')
    num = 2
    while soup.find('title').text:
        ll = soup.find_all('td', 'bluebg')
        for j in ll:
            l = j.find('a')
            w.write(r.url + ' : ' + m + l.get('href') + '\n')
        print(r.url)
        n = '?page_no=' + str(num)
        r = s.post(i + n)
        # parse the next page and advance the page counter
        soup = BS(r.content, 'html.parser')
        num += 1
def downloadImage(ref, folder, n):
    html = urllib.request.urlopen(urllib.request.Request(ref, headers=hdr))
    soup = BS(html, 'html.parser')
    img = soup.find(id="img").attrs['src']
    r = requests.get(img, allow_redirects=True)
    open("./" + folder + "/" + str(n) + ".jpg", 'wb').write(r.content)
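# downloadImage above relies on a module-level `hdr` dict and these imports;
# a minimal setup sketch (the User-Agent value is an arbitrary placeholder):
# import urllib.request
# import requests
# from bs4 import BeautifulSoup as BS
# hdr = {'User-Agent': 'Mozilla/5.0'}
# The target folder must already exist, e.g. os.makedirs(folder, exist_ok=True).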
for uni in unis:
    # getting the page: make url vars
    link_p = ("https://www.theuniguide.co.uk/search/course?utf8=%E2%9C%93&c%5Bq%5D="
              + str(off_course_name)
              + "&c%5Bacademic_years%5D=2021&c%5Binstitution_slug%5D%5B%5D="
              + uni.name.lower() + "-" + uni.code
              + "&c%5Bsort%5D=relevance")
    link = str(link_p)
    url = requests.get(link)
    src = url.content  # getting the html of the page
    soup = BS(src, "lxml")  # making the html parsable
    # extracting info
    name = uni.name.title()
    name_txt = name.replace("-", " ")
    try:
        courses_type = soup.find_all("div", {"class": "course-snippets"})
        # selecting a Bachelors degree
        if "B" in courses_type[0].text:
            course_text = courses_type[0].text
            num = 0
session = requests.Session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:47.0) Gecko/20100101 Firefox/47.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}
base_url = 'https://www.work.ua/jobs-kyiv-python/'
domain = 'https://www.work.ua'
jobs = []
urls = []
urls.append(base_url)

req = session.get(base_url, headers=headers)
if req.status_code == 200:
    bsObj = BS(req.content, "html.parser")
    pagination = bsObj.find('ul', attrs={'class': 'pagination'})
    if pagination:
        pages = pagination.find_all('li', attrs={'class': False})
        for page in pages:
            urls.append(domain + page.a['href'])

for url in urls:
    time.sleep(2)
    req = session.get(url, headers=headers)
    if req.status_code == 200:
        bsObj = BS(req.content, "html.parser")
        div_list = bsObj.find_all('div', attrs={'class': 'job-link'})
        for div in div_list:
            title = div.find('h2')
            href = title.a['href']
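# The loop above extracts `title` and `href` but never fills `jobs`; a plausible
# completion (the dict keys are assumptions, not from the original snippet):
# jobs.append({'title': title.a.text.strip(), 'href': domain + href})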
import bs4
from bs4 import BeautifulSoup as BS
from urllib.request import urlopen as ureq

uClient = ureq("https://uwaterloo.ca/")
thepage = uClient.read()
uClient.close()
soup = BS(thepage, 'lxml')  # parse the downloaded page

i = 1  # fallback counter for images without alt text
for img in soup.findAll("img"):
    temp = img.get("src")
    if temp[:1] == "/":
        image = "https://uwaterloo.ca/" + temp
    else:
        image = temp
    nametemp = img.get("alt")
    if len(nametemp) == 0:
        filename = str(i)
        i += 1
    else:
        filename = nametemp
    imagefile = open(filename + ".jpeg", "wb")
    imagefile.write(ureq(image).read())
    imagefile.close()
def Soup(content):
    from bs4 import BeautifulSoup as BS
    return BS(content, "html.parser")
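# Usage sketch for Soup (the HTML literal is illustrative):
# soup = Soup('<p>hello</p>')
# print(soup.p.text)  # -> hello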
def get_hotel_review(url):
    print('Get hotel review url.')
    html = requests.get(url)
    soup = BS(html.content, 'html.parser')
    page_no = []
    temp_url = []
    for l1 in soup.findAll('div', {'id': "REVIEWS"}):
        for l2 in l1.findAll('span', {'class': "pageNum last taLnk "}):
            page_no.append(l2.get('data-page-number'))
    print('Review pages: ' + page_no[0])

    # Get review links
    for l1 in soup.findAll('div', {'class': "quote"}):
        container = l1.find('a')
        temp_url.append('https://www.tripadvisor.com.sg' + container['href'])
    userReviewURL = []
    userReviewURL.append(temp_url[0])

    # Loop through all review pages
    for i in range(int(page_no[0]) - 1):  # use this line for a full scrape (all pages of reviews)
    #for i in range(2):                   # use this line for a partial scrape when debugging
        html = requests.get(userReviewURL[i])
        print(userReviewURL[i])
        soup = BS(html.content, 'html.parser')
        container = soup.find('a', {'data-page-number': i + 2})
        urlTemp = 'https://www.tripadvisor.com.sg' + container['href']
        userReviewURL.append(urlTemp)

    # Read reviews
    print('Reading reviews.')
    temp = []
    uids = []
    global names
    global ratings
    global dates
    global titles
    global bodies
    global recommendTitles
    global recommendAnswers
    names[:] = []
    ratings[:] = []
    dates[:] = []
    titles[:] = []
    bodies[:] = []
    recommendTitles[:] = []
    recommendAnswers[:] = []
    for i in range(len(userReviewURL)):
        html = requests.get(userReviewURL[i])
        soup = BS(html.content, 'html.parser')
        container = soup.find('div', {'id': 'SHOW_USER_REVIEW'})
        print('Parsing url : ' + userReviewURL[i])
        for j in range(5):
            temp = container.findAll('div', {'id': re.compile('^review_')})[j]
            name = temp.find('span', {'class': re.compile('^expand_inline')})
            if name is None:
                continue
            rating = temp.find('span', {'class': re.compile('^ui_bubble_rating')})['class'][1]
            date = temp.find('span', {'class': 'ratingDate'}).next_element
            if j == 0:
                title = temp.find('div', {'property': 'name'})
                body = temp.find('p', {'property': 'reviewBody'})
            else:
                title = temp.find('span', {'class': 'noQuotes'})
                body = temp.find('p', {'id': re.compile('^review_')})
            recommendTitle = temp.find('span', {'class': 'recommend-titleInline'})
            recommendAnswer = temp.findAll('li', {'class': 'recommend-answer'})
            memberInfo = temp.find('div', {'class': 'member_info'})
            memberOverlayLink = memberInfo.find('div', {'class': 'memberOverlayLink'})
            if memberOverlayLink is not None:
                uid = memberOverlayLink['id']
                print('uid : ' + uid)
            else:
                uid = ""
            if name is not None and len(name) > 0:
                names.append(name.text)
            else:
                names.append('')
            if rating is not None and len(rating) > 0:
                ratings.append(rating[7])
            else:
                ratings.append('')
            if date is not None and len(date) > 0:
                dates.append(date)
            else:
                dates.append('')
            if title is not None and len(title) > 0:
                titles.append(title.text)
            else:
                titles.append('')
            if body is not None and len(body) > 0:
                bodies.append(body.text.strip('\n'))
            else:
                bodies.append('')
            if recommendTitle is not None and len(recommendTitle) > 0:
                recommendTitles.append(recommendTitle.text)
            else:
                recommendTitles.append('')
            if recommendAnswer is not None and len(recommendAnswer) > 0:
                jsonTemp = {}
                for k in range(len(recommendAnswer)):
                    jsonTemp[recommendAnswer[k].text.strip('\n')] = recommendAnswer[k].find('span')['alt'][0]
                recommendAnswers.append(json.dumps(jsonTemp))
            else:
                recommendAnswers.append('')
            if uid is not None and len(uid) > 0:
                uids.append(uid[4:uid.find('-SRC')])
            else:
                uids.append('')
    write_to_mongoDB("user_review")
    get_member_profile(uids)
def get_member_profile(uids):
    print('Get member profile.')
    print(uids)
    memberOverlayLink = []
    memberProfileURL = []
    for i in range(len(uids)):
        if len(uids[i]) > 0:
            memberOverlayLink.append('https://www.tripadvisor.com.sg/MemberOverlay?Mode=owa&uid=' + str(uids[i]))
            memberProfileURL.append('https://www.tripadvisor.com.sg/MemberProfile-a_uid.' + str(uids[i]))
        else:
            memberOverlayLink.append('')
            memberProfileURL.append('')
    global ageGenders
    global hometowns
    global travelStyleTags
    global points
    global levels
    global usernames
    global ages
    global genders
    ageGenders[:] = []
    hometowns[:] = []
    travelStyleTags[:] = []
    points[:] = []
    levels[:] = []
    usernames[:] = []
    ages[:] = []
    genders[:] = []
    for i in range(len(memberProfileURL)):
        if memberProfileURL[i] is not None and len(memberProfileURL[i]) > 0:
            print('memberProfileURL[i] : ' + memberProfileURL[i])
            html = requests.get(memberProfileURL[i])
            soup = BS(html.content, 'html.parser')
            container = soup.find('div', {'id': 'MODULES_MEMBER_CENTER'})
            if container is not None:
                ageGender = container.find('div', {'class': 'ageSince'})
                hometown = container.find('div', {'class': 'hometown'})
                travelStyleTag = container.findAll('div', {'class': 'tagBubble unclickable'})
                point = container.find('div', {'class': 'points'})
                level = container.find('div', {'class': 'level tripcollectiveinfo'})
                username = container.find('span', {'class': 'nameText'})
                try:
                    # this line was redacted in the source; printing the username
                    # with a fallback for non-ASCII text is the apparent intent
                    print("username: " + username.text)
                except:
                    print("unable to show username due to unicode text")
                if len(ageGender) > 0:
                    ageGenders.append(ageGender.text[14:].strip())
                    splitAgeGenders = ageGender.text[14:].strip().split('old')
                    if len(splitAgeGenders) == 1:
                        ages.append(splitAgeGenders[0])
                        genders.append('')
                    elif len(splitAgeGenders) == 2:
                        ages.append(splitAgeGenders[0])
                        genders.append(splitAgeGenders[1])
                else:
                    ageGenders.append('')
                if len(hometown) > 0:
                    hometowns.append(hometown.text)
                else:
                    hometowns.append('')
                if len(travelStyleTag) > 0:
                    listTemp = []
                    for j in range(len(travelStyleTag)):
                        listTemp.append(travelStyleTag[j].text.strip())
                    travelStyleTags.append(listTemp)
                else:
                    travelStyleTags.append('')
                if len(point) > 0:
                    points.append(int(str(point.text.strip()).replace(',', '')))
                else:
                    points.append('')
                if level is not None:
                    levels.append(level.text[6:level.text.find(' ', 6)].strip())
                else:
                    levels.append('')
                if len(username) > 0:
                    usernames.append(username.text)
                else:
                    usernames.append('')
    write_to_mongoDB("member_profile")
def handle(self, *args, **options):
    post = Post.objects.filter(hide=False,
                               source__icontains='http://chihuo.org/',
                               is_approved=False)
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        # 'Accept-Encoding': 'none',
        # 'Accept-Language': 'en-US,en;q=0.8',
        # 'Connection': 'keep-alive',
        'referer': 'http://www.yelp.com/',
    }
    for p in post:
        name = p.source.replace('http://chihuo.org/', '')
        name = name.replace('/', '')
        name = name.replace('-', '+')
        keyword = name + '+LA+yelp'
        google_url = 'https://www.google.com/search?q=%s' % keyword
        google_soup = BS(urllib2.urlopen(urllib2.Request(google_url, headers=headers)).read())
        urls = google_soup.select('h3.r a')
        yelp_url = None
        for u in urls:
            if 'yelp.com' in u['href']:
                yelp_url = u['href']
                break
        if not yelp_url:
            print 'Yelp link not found'
            yelp_url = raw_input('Enter Yelp link: ')
        hot_area = '洛杉矶'  # 'Los Angeles'; stored as-is, used as a lookup key below
        soup = BS(urllib2.urlopen(urllib2.Request(yelp_url, headers=headers)).read())
        name = soup.find('h1', {'class': 'biz-page-title'}).get_text().lstrip().rstrip()
        print name
        name2 = raw_input('Enter Chinese name if it has one: ')
        tags = soup.find('span', {'class': 'category-str-list'}).get_text()
        print tags
        tag = raw_input('Enter tags: ')
        if not tag:
            return
        phone = soup.find('span', {'class': 'biz-phone'}).get_text()
        phone = ''.join([s for s in phone if s.isdigit()])
        street1 = soup.find('span', {'itemprop': 'streetAddress'}).get_text()
        city_text = soup.find('span', {'itemprop': 'addressLocality'}).get_text()
        state_text = soup.find('span', {'itemprop': 'addressRegion'}).get_text()
        postcode_text = soup.find('span', {'itemprop': 'postalCode'}).get_text()
        map_url = soup.select('a.biz-map-directions img')[0]['src']
        query = urlparse(map_url).query
        query_parsed = parse_qsl(query)
        lat = None
        lng = None
        for q in query_parsed:
            if q[0] == 'center':
                center = q[1]
                lat = center.split(',')[0]
                lng = center.split(',')[1]
        try:
            state = State.objects.get(name=state_text)
        except:
            state = State(name=state_text)
            state.save()
        try:
            city = City.objects.get(name=city_text)
        except:
            city = City(name=city_text, state=state)
            city.save()
        try:
            postcode = Postcode.objects.get(number=postcode_text)
        except:
            postcode = Postcode(number=postcode_text, state=state)
            postcode.save()
        try:
            hot_area = Hot_area.objects.get(name=hot_area)
        except:
            hot_area = Hot_area(name=hot_area)
            hot_area.save()
        business = Business(
            name=name,
            name2=name2,
            phone=phone,
            street1=street1,
            city=city,
            postcode=postcode,
            latitude=lat,
            longitude=lng,
            hot_area=hot_area,
        )
        photo_url = soup.select('.showcase-photo-box img')[0]['src']
        ext = mimetypes.guess_extension(mimetypes.guess_type(photo_url)[0])
        req = urllib2.Request(photo_url, headers=headers)
        img_temp = NamedTemporaryFile(delete=True)
        img_temp.write(urllib2.urlopen(req).read())
        img_temp.flush()
        business.photo.save('%s%s' % (uuid.uuid4(), ext), File(img_temp))
        print '--- Are you sure? ---'
        print p.title, business.name, business.name2
        sure = raw_input('Are you sure? ')
        if sure in ['y', 'Y', 'yes', 'YES', 'Yes']:
            business.save()
            # tags
            tag = tag.split(' ')
            for t in tag:
                try:
                    t_instance = Tag.objects.get(name=t)
                except:
                    t_instance = Tag(name=t)
                    t_instance.save()
                business.tag.add(t_instance)
            p.hide = True
            p.business = business
            p.save()
            print 'Success'
        else:
            print 'Fail'
# title: Scraping a section of webpage based on text
# url: https://stackoverflow.com/questions/67754320/scraping-a-section-of-webpage-based-on-text/67756231#67756231

import selenium.webdriver
from bs4 import BeautifulSoup as BS
import time

url = 'https://www.flashscore.com/football/chile/primera-division/'
driver = selenium.webdriver.Firefox()
driver.get(url)
time.sleep(5)

soup = BS(driver.page_source, 'html.parser')

print('--- version 1 ---')
section = soup.find('div', id='live-table').find('section')
for item in section.find_all('div', title='Click for match detail!'):
    print(item.get('id'))

print('--- version 2 ---')
section = soup.find('section', class_='event--live')
for item in section.find_all('div', title='Click for match detail!'):
    print(item.get('id'))
import requests
import re
from bs4 import BeautifulSoup as BS
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize

# cleaning and extracting
url = 'http://shakespeare.mit.edu/allswell/full.html'
r = requests.get(url)
html = r.text
soup = BS(html, 'html5lib')
text = soup.get_text()
blockquote = soup.findAll('blockquote')
tokenizer = RegexpTokenizer(r"\w+[']*\w*")
tokens = tokenizer.tokenize(text)
s = ''
for block in blockquote:
    try:
        lines = block.find_all('a')
        for line in lines:
            s += line.text + ' '
    except:
        continue
new = " ".join(s.split())
file = open("allswell.txt", 'w')
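# Quick sanity check of the tokenizer pattern above: it keeps internal
# apostrophes, so contractions survive as single tokens (illustrative only):
# tok = RegexpTokenizer(r"\w+[']*\w*")
# print(tok.tokenize("All's well that ends well."))  # ["All's", 'well', 'that', 'ends', 'well']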
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as BS
from bs4 import SoupStrainer

dir_str = str(os.getcwd())
#directory = os.fsencode(dir_str)
poynt = 0
for file in os.listdir(dir_str):
    filename = os.fsdecode(file)
    if filename.endswith('.html'):
        os.chdir(dir_str)
        print(filename)
        with open(filename, 'r') as doc:
            parse = BS(doc, 'lxml')
            for tag in parse.find_all('div', class_='col-xs-12 col-md-8 col-lg-9'):
                n_rows = 0
                n_columns = 0
                point = 0
                for titles in tag.find_all('div', class_='heading'):
                    point += 1
                    if point == 1:
                        column_title = []
                        for columns in titles.find_all('div'):
                            column_title.append(str(columns.text.strip()))
                            n_columns += 1
                        column_title.append('Beer Styles')
                        column_title.insert(2, 'Maximum ' + str(column_title[1]))
                        column_title[1] = 'Minimum ' + str(column_title[1])
                        #print(column_title)
import requests
from bs4 import BeautifulSoup as BS

url = 'https://producthunt.com'
r = requests.get(url)
html = r.text
soup = BS(html, 'html.parser')

for first in soup.find_all("main", id="app"):
    for second in first.find_all("div"):
        for third in second.find_all("div"):
            for a in first.find_all("div", class_="content_1mxGg"):
                for div in a.find_all("div", class_="legacyPosts_3AHFn"):
                    for ul in div.find_all("ul", class_="postsList_3n2Ck"):
                        for li in ul.find_all("li"):
                            for s in li.find_all("div", class_="postItem_RepXj"):
                                for p in s.find_all("div", class_="content_3Qj0y"):
                                    for q in p.find_all("span", class_="title_24w6f featured_2PenR default_25TkV base_3LqBu"):
                                        print(q.text)

# leftover exploratory code from the original, kept as comments:
# b = a.find_all(...)[0]
# l = b.find_all("ul", class_="postsList_3n2Ck")[0]
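# The nesting above can usually be collapsed into one CSS selector; a sketch
# reusing the class names from the snippet (they are obfuscated build artifacts
# and will have changed on the live site):
# for q in soup.select('main#app span.title_24w6f'):
#     print(q.text)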
def appleipad(message):
    try:
        if message.text == 'ipad 7 10.2"':
            Rozetkaipad_7 = requests.get(
                'https://rozetka.com.ua/apple-ipad-7-10-2-a2197-wi-fi-32gb-2019-16127127/g22946813/')
            Rozetkaipad_7 = Rozetkaipad_7.text
            Rozetkaipad_7 = BS(Rozetkaipad_7, 'html.parser')
            Rozetkaipad_7 = Rozetkaipad_7.find('p', {'class': "product-prices__big"})
            Rozetkaipad_7 = list(Rozetkaipad_7)
            Rozetkaipad_7keyboard = types.InlineKeyboardMarkup()
            Rozetkaipad_7button = types.InlineKeyboardButton(
                text=RozetkaButtonText,
                url='https://rozetka.com.ua/apple-ipad-7-10-2-a2197-wi-fi-32gb-2019-16127127/g22946813/')
            Rozetkaipad_7keyboard.add(Rozetkaipad_7button)

            Fixeripad_7 = requests.get(
                'https://fixer.com.ua/pda/apple-a2197-ipad-10.2-wi-fi-32gb-space-grey-mw742rk-a_976052.html')
            Fixeripad_7 = Fixeripad_7.text
            Fixeripad_7 = BS(Fixeripad_7, 'html.parser')
            Fixeripad_7 = Fixeripad_7.find('div', {'class': "pi_price"})
            Fixeripad_7 = str(Fixeripad_7)
            Fixeripad_7 = list(Fixeripad_7)
            Fixeripad_7keyboard = types.InlineKeyboardMarkup()
            Fixeripad_7button = types.InlineKeyboardButton(
                text=fixerbuttontext,
                url='https://fixer.com.ua/pda/apple-a2197-ipad-10.2-wi-fi-32gb-space-grey-mw742rk-a_976052.html')
            Fixeripad_7keyboard.add(Fixeripad_7button)

            Brainipad_7 = requests.get(
                'https://brain.com.ua/ukr/Planshet_Apple_A2197_iPad_102_Wi-Fi_32GB_Silver_MW752RK_A-p638525.html')
            Brainipad_7 = Brainipad_7.text
            Brainipad_7 = BS(Brainipad_7, 'html.parser')
            Brainipad_7 = Brainipad_7.find('div', {'class': "br-pr-np"})
            Brainipad_7 = str(Brainipad_7)
            Brainipad_7 = list(Brainipad_7)
            Brainipad_7keyboard = types.InlineKeyboardMarkup()
            Brainipad_7button = types.InlineKeyboardButton(
                text=brainbuttontext,
                url='https://brain.com.ua/ukr/Planshet_Apple_A2197_iPad_102_Wi-Fi_32GB_Silver_MW752RK_A-p638525.html')
            Brainipad_7keyboard.add(Brainipad_7button)

            bot.send_message(message.chat.id,
                             'Rozetka - ' + Rozetkaipad_7[0] + ' ₴',
                             reply_markup=Rozetkaipad_7keyboard)
            # the price digits are picked out of the raw HTML character by
            # character, which is brittle if the page markup changes
            bot.send_message(message.chat.id,
                             'Fixer - ' + Fixeripad_7[114] + Fixeripad_7[115] + ' ' +
                             Fixeripad_7[116] + Fixeripad_7[117] + Fixeripad_7[118] + ' ₴',
                             reply_markup=Fixeripad_7keyboard)
            bot.send_message(message.chat.id,
                             'Brain - ' + Brainipad_7[139] + Brainipad_7[140] + ' ' +
                             Brainipad_7[141] + Brainipad_7[142] + Brainipad_7[143] + ' ₴',
                             reply_markup=Brainipad_7keyboard)
    except:
        bot.send_message(message.chat.id, 'An error occurred')
def getMatches(team=''):
    url = "http://www.espncricinfo.com/ci/engine/match/index.html?view=live"
    r = requests.get(url)
    soup = BS(r.text, "html.parser")
    tableHeads = soup.find_all('div', {'class': 'match-section-head'})
    tableData = soup.find_all('section', {'class': 'matches-day-block'})
    team_matches = []
    if team == '':
        print "\nHere are the events going on live right now: "
        for ix in range(0, len(tableHeads)):
            print "\t" + str(ix + 1) + ". " + str(tableHeads[ix].h2.text)
        try:
            ch = raw_input("\nChoose the event for which you wish to check out the matches (Enter 0 to See All; -1 to exit): ")
            ch = int(ch)
            if ch == -1:
                sys.exit("Hope you had fun. Have a great day ahead!")
            temp = tableData[ch - 1] or (ch == 0)
        except (IndexError, ValueError):
            print 'Please enter a valid integer between -1 and ' + str(len(tableData)) + '.'
            askForExit()
    else:
        ch = 0
    if ch > 0:
        matches = tableData[ch - 1].find_all('section', {'class': 'default-match-block'})
    else:
        matches = tableData[0].find_all('section', {'class': 'default-match-block'})
        for ix in range(1, len(tableData)):
            matches = matches + tableData[ix].find_all('section', {'class': 'default-match-block'})
    for ix in range(0, len(matches)):
        matchDetails = matches[ix].find_all('div')
        team1 = str(matchDetails[1].text.split('\n', 1)[1].split(' ')[0])
        if len(str(matchDetails[1].text.split('\n', 1)[1].split(' ')[1])) > 0:
            team1 = team1 + " " + str(matchDetails[1].text.split('\n', 1)[1].split(' ')[1])
        score1 = str(matchDetails[1].find('span').text)
        if len(str(matchDetails[1].text.split('\n', 1)[1].split(' ')[2])) > 0:
            team1 = team1 + " " + str(matchDetails[1].text.split('\n', 1)[1].split(' ')[2])
        score2 = str(matchDetails[2].find('span').text)
        team2 = str(matchDetails[2].text.split('\n', 1)[1].split(' ')[0])
        if len(str(matchDetails[2].text.split('\n', 1)[1].split(' ')[1])) > 0:
            team2 = team2 + " " + str(matchDetails[2].text.split('\n', 1)[1].split(' ')[1])
        if len(str(matchDetails[2].text.split('\n', 1)[1].split(' ')[2])) > 0:
            team2 = team2 + " " + str(matchDetails[2].text.split('\n', 1)[1].split(' ')[2])
        headerline = "Match " + str(ix + 1) + ": " + team1 + " vs " + team2
        if len(headerline) < 40:
            headerline += (" " * (40 - len(headerline)))
        if team in ['', team1.lower(), team2.lower()]:
            team_matches.append(ix + 1)
            print "\n" + headerline + "\t\t(" + str(matchDetails[0].find('span', {'class': 'bold'}).text) + ")"
            print str(matchDetails[0].find('span', class_='match-no').a.text.split(' ', 1)[1])
            print "\t" + team1 + ": " + score1 + "\n\t" + team2 + ": " + score2
            print "\n" + matchDetails[3].text.split('\n')[1]
            print "_" * 50
    if len(team_matches) == 0 and team != '':
        print 'Sorry! No match found for team ' + team + '.'
        getMatches('')
    try:
        if len(team_matches) == 1:
            ch = team_matches[0]
        else:
            ch = raw_input("\nChoose the event for which you wish to see the whole scorecard (Enter -1 to Exit; 0 for previous menu): ")
            ch = int(ch)
        if ch == -1:
            sys.exit("Hope you had fun. Have a great day ahead!")
        if ch == 0:
            getMatches(' ')
        temp = matches[ch - 1]
    except (IndexError, ValueError):
        print 'Please enter a valid integer between -1 and ' + str(len(matches)) + '.'
        askForExit()
    url2 = "http://www.espncricinfo.com" + matches[ch - 1].find_all('div')[4].find_all('a')[0]['href'] + "?view=scorecard"
    matchDetails = matches[ch - 1].find_all('div')
    team1 = str(matchDetails[1].text.split('\n', 1)[1].split(' ')[0])
    if len(str(matchDetails[1].text.split('\n', 1)[1].split(' ')[1])) > 0:
        team1 = team1 + " " + str(matchDetails[1].text.split('\n', 1)[1].split(' ')[1])
    score1 = str(matchDetails[1].find('span').text)
    if len(str(matchDetails[1].text.split('\n', 1)[1].split(' ')[2])) > 0:
        team1 = team1 + " " + str(matchDetails[1].text.split('\n', 1)[1].split(' ')[2])
    score2 = str(matchDetails[2].find('span').text)
    team2 = str(matchDetails[2].text.split('\n', 1)[1].split(' ')[0])
    if len(str(matchDetails[2].text.split('\n', 1)[1].split(' ')[1])) > 0:
        team2 = team2 + " " + str(matchDetails[2].text.split('\n', 1)[1].split(' ')[1])
    if len(str(matchDetails[2].text.split('\n', 1)[1].split(' ')[2])) > 0:
        team2 = team2 + " " + str(matchDetails[2].text.split('\n', 1)[1].split(' ')[2])
    meta = "\t" + team1 + ": " + score1 + "\n\t" + team2 + ": " + score2
    meta += "\n\n" + matchDetails[3].text.split('\n')[1]
def IPhonxs(message):
    if message.text == 'Iphone Xs 64GB':
        try:
            RozetkaAppleiPhoneXs64GB = requests.get(
                'https://rozetka.com.ua/apple_iphone_xs_64gb_space_gray/p172930640/')
            RozetkaAppleiPhoneXs64GB = RozetkaAppleiPhoneXs64GB.text
            RozetkaAppleiPhoneXs64GB = BS(RozetkaAppleiPhoneXs64GB, 'html.parser')
            RozetkaAppleiPhoneXs64GB = RozetkaAppleiPhoneXs64GB.find(
                'p', {'class': 'product-prices__big product-prices__big_color_red'})
            RozetkaAppleiPhoneXs64GB = list(RozetkaAppleiPhoneXs64GB)
            RozetkaAppleiPhoneXs64GB = RozetkaAppleiPhoneXs64GB[0]
            RozetkaAppleiPhoneXs64GBkeyboard = types.InlineKeyboardMarkup()
            RozetkaAppleiPhoneXs64GBbutton = types.InlineKeyboardButton(
                text=RozetkaButtonText,
                url='https://rozetka.com.ua/apple_iphone_xs_64gb_space_gray/p172930640/')
            RozetkaAppleiPhoneXs64GBkeyboard.add(RozetkaAppleiPhoneXs64GBbutton)

            FokstrotAppleiPhoneXs64GB = requests.get(
                'https://www.foxtrot.com.ua/ru/shop/mobilnye_telefony_apple_iphone-xs-64gb-silver.html?utm_medium=cpc&utm_source=hotline&utm_campaign=DIG-mobilephohe&utm_term=mobilnye_telefony_apple_iphone-xs-64gb-silver&utm_content=6423074')
            FokstrotAppleiPhoneXs64GB = FokstrotAppleiPhoneXs64GB.text
            FokstrotAppleiPhoneXs64GB = BS(FokstrotAppleiPhoneXs64GB, 'html.parser')
            FokstrotAppleiPhoneXs64GB = FokstrotAppleiPhoneXs64GB.find('div', {'class': "card-price"})
            FokstrotAppleiPhoneXs64GB = list(FokstrotAppleiPhoneXs64GB)
            FokstrotAppleiPhoneXs64GB = FokstrotAppleiPhoneXs64GB[0]
            FokstrotPhoneXs64GBkeyboard = types.InlineKeyboardMarkup()
            FokstrotPhoneXs64GBbutton = types.InlineKeyboardButton(
                text=FokstrotButtonText,
                url='https://www.foxtrot.com.ua/ru/shop/mobilnye_telefony_apple_iphone-xs-64gb-silver.html?utm_medium=cpc&utm_source=hotline&utm_campaign=DIG-mobilephohe&utm_term=mobilnye_telefony_apple_iphone-xs-64gb-silver&utm_content=6423074')
            FokstrotPhoneXs64GBkeyboard.add(FokstrotPhoneXs64GBbutton)

            buyAppleiPhoneXs64GB = requests.get('http://www.buy.ua/shop/1400215/1400417/1712920.html')
            buyAppleiPhoneXs64GB = buyAppleiPhoneXs64GB.text
            buyAppleiPhoneXs64GB = BS(buyAppleiPhoneXs64GB, 'html.parser')
            buyAppleiPhoneXs64GB = buyAppleiPhoneXs64GB.find('div', {'class': "price-info"})
            buyAppleiPhoneXs64GB = str(buyAppleiPhoneXs64GB)
            buyAppleiPhoneXs64GB = list(buyAppleiPhoneXs64GB)
            buyPhoneXs64GBkeyboard = types.InlineKeyboardMarkup()
            buyPhoneXs64GBbutton = types.InlineKeyboardButton(
                text=buyButtonText,
                url='http://www.buy.ua/shop/1400215/1400417/1712920.html')
            buyPhoneXs64GBkeyboard.add(buyPhoneXs64GBbutton)

            bot.send_message(message.chat.id,
                             'Rozetka - ' + RozetkaAppleiPhoneXs64GB + ' ₴',
                             reply_markup=RozetkaAppleiPhoneXs64GBkeyboard)
            bot.send_message(message.chat.id,
                             'Фокстрот - ' + FokstrotAppleiPhoneXs64GB + ' ₴',
                             reply_markup=FokstrotPhoneXs64GBkeyboard)
            # the price digits are assembled from individual characters of the
            # raw HTML, which breaks as soon as the page markup shifts
            bot.send_message(message.chat.id,
                             'Buy.ua - ' + str(buyAppleiPhoneXs64GB[55]) + str(buyAppleiPhoneXs64GB[56]) + ' ' +
                             str(buyAppleiPhoneXs64GB[57]) + str(buyAppleiPhoneXs64GB[58]) +
                             str(buyAppleiPhoneXs64GB[59]) + ' ₴',
                             reply_markup=buyPhoneXs64GBkeyboard)
        except:
            bot.send_message(message.chat.id, 'An error occurred')
combinedDF['highestWord'].values[i] = individualWordsResult['highestWord']
combinedDF['highestWordCount'].values[i] = individualWordsResult['highestWordCount']
combinedDF['lowestWord'].values[i] = individualWordsResult['lowestWord']
combinedDF['lowestWordCount'].values[i] = individualWordsResult['lowestWordCount']

if result > 0:
    # it exists (less common case), now find out where
    print("Checking for [" + needle + "] in DOM tags of " + haystack)
    soup = BS(lowerpagetext)
    combinedDF['KeywordFoundinHTags'].values[i] = (
        find_by_text(soup, needle, 'h1')
        + find_by_text(soup, needle, 'h2')
        + find_by_text(soup, needle, 'h3'))
    combinedDF['KeywordFoundinTitle'].values[i] = soup.title.string.lower().count(lowerneedle)
    with pd.ExcelWriter(name + '.xlsx') as writer:
        combinedDF.to_excel(writer, sheet_name='data')
    print("finished and outputted to excel file")
else:
    print("nothing found")
def futureQuote(self):
    self.driverf.get('http://info512.taifex.com.tw/Future/FusaQuote_Norl.aspx')
    time.sleep(0.1)
    soup = BS(self.driverf.page_source, 'lxml')
    self.future_table = pd.read_html(str(soup.select('#divDG')[0]), header=0)[0]
def main(handle, code_list_file, code_path):
    with open(code_list_file, "r", encoding="utf-8") as f:
        codeList = json.load(f)
    num = 1
    count = 100
    while True:
        conn = get_response("https://codeforces.com/api/user.status?",
                            f"handle={handle}&from={num}&count={count}")
        data = conn.read()
        conn.close()
        res = json.loads(data.decode("utf-8"))
        print(num)
        if res["status"] == "OK":
            results = res["result"]
            if results == []:
                break
            for result in results:
                if result['verdict'] != "OK":
                    continue
                contestId = result['problem']['contestId']
                contestIdx = result['problem']['index']
                solutionId = result['id']
                language = result['programmingLanguage']
                fName = str(contestId) + str(contestIdx)
                if fName not in codeList:
                    codeList[fName] = [solutionId]
                else:
                    if solutionId in codeList[fName]:
                        continue
                    else:
                        codeList[fName].append(solutionId)
                        fName += "_" + str(len(codeList[fName]))
                solutionApi = f'https://codeforces.com/contest/{contestId}/submission/{solutionId}'
                conn = get_response(solutionApi)
                bs_res = BS(conn, 'html.parser')
                conn.close()
                code = bs_res.find('pre').text.strip()
                code = re.sub("\r\n", "\n", code)
                fName += file_type[language]
                remark = make_remark(
                    result['problem']['name'], handle,
                    datetime.datetime.fromtimestamp(result['creationTimeSeconds']).isoformat(),
                    language)
                code = remark + code
                try:
                    with open(code_path + fName, "w", encoding="utf-8") as f:
                        f.write(code)
                except:
                    print(code_path + " does not exist, or the file is currently open.")
                    sys.exit(1)
                time.sleep(random.uniform(1, 2))
        num += count
    try:
        with open(code_list_file, "w", encoding="utf-8") as f:
            json.dump(codeList, f)
    except:
        print("An error occurred while rewriting " + code_list_file + ".")
        sys.exit(1)
""" Code Challenge 4 Scrap the data from the URL below and store in sqlite database https://www.icc-cricket.com/rankings/mens/team-rankings/odi """ from bs4 import BeautifulSoup as BS import requests url = "https://www.icc-cricket.com/rankings/mens/team-rankings/odi" source = requests.get(url).text soup = BS(source, 'lxml') print(soup.prettify()) right_table = soup.find('table', class_='table') print(right_table.prettify()) A = [] B = [] C = [] D = [] E = [] for i in right_table.findAll('tr'): rows = i.findAll('td') if len(rows) == 5: A.append(rows[0].text.strip())
def make_request(url):
    page = requests.get(url)
    soup = BS(page.text, 'html.parser')
    return soup
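# Usage sketch for make_request (URL illustrative; assumes `requests` and BS imported):
# soup = make_request('https://example.com')
# print(soup.title.string)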
""" Author: https://github.com/Romawechka Python version 3.8.5 """ import requests from bs4 import BeautifulSoup as BS if __name__ == "__main__": req = requests.get('http://www.cbr.ru/scripts/XML_daily.asp') xml = BS(req.content, 'lxml') for el in xml.findAll('valute'): Gon_dollar = str(el.find('name')).replace('/', '').replace('<name>', '') if Gon_dollar == 'Гонконгских долларов': value = str(el.find('value')).replace('/', '').replace('<value>', '') char = str(el.find('charcode')).replace('/', '').replace( '<charcode>', '') print(f'1 {char} = {value} Rub') break inp = input( '\nIf you want to exit enter any message or just close the application.\n' ) exit()
utterance will begin with NO NON-DOM. If there is non-dominant hand gloss in
the utterance there will be **NON-DOM** followed by the non-dominant hand
gloss."""
from bs4 import BeautifulSoup as BS
import re

partial_path = """<write the path to "ncslgr-xml">"""  # the path where ncslgr-xml is saved on your local machine
dominant_only_gloss = ()
dominant_and_non_dominant_gloss = ()
with open(partial_path + r'\close call.xml', 'r') as f_IN:
    # f_OUT_utts: the file you want the output appended to
    with open("""Path name to file output""", 'a') as f_OUT_utts:
        soup = BS(f_IN.read(), 'xml')
        for utterance_tag in soup.find_all('UTTERANCES'):
            for utterance_tags in utterance_tag.find_all('UTTERANCE'):
                if utterance_tags.find_all('TRACK', {'FID': '10001'}):
                    for dominant_track_tags in utterance_tags.find_all('TRACK', {'FID': '10000'}):
                        for dominant_a_tags in dominant_track_tags.find_all('A'):
                            if dominant_a_tags.has_attr('VID'):
                                dominant_a_tags.decompose()
                    for non_dominant_track_tags in utterance_tags.find_all('TRACK', {'FID': '10001'}):
                        for non_dominant_a_tags in non_dominant_track_tags.find_all('A'):
                            if non_dominant_a_tags.has_attr('VID'):
def price_guide(item, max_cost_quantile=None):
    """Fetch pricing info for an item"""
    results = []
    if (item['ItemTypeID'] == 'P' and 'stk0' in item['ItemID']) or \
            item['ItemTypeID'] == 'S' or \
            item['ItemTypeID'] == 'M':
        # a sticker sheet, a set, or a minifigure
        color_ids = [0]
    else:
        # a normal item
        color_ids = color.similar_to(item['ColorID'])

    for c in color_ids:
        # perform HTTP request
        parameters = {
            'itemType': item['ItemTypeID'],
            'itemNo': item['ItemID'],
            'itemSeq': 1,
            'colorId': c,
            'v': 'P',
            'priceGroup': 'Y',
            'prDec': 2
        }
        url = "http://www.bricklink.com/catalogPG.asp?" + urllib.urlencode(parameters)
        html = urllib.urlopen(url).read()

        # parse page
        page = BS(html)

        if len(page.find_all(text='Currently Available')) == 0:
            # not available in this color :(
            continue
        else:
            # newly found inventory
            new = []
            for td in page.find_all('td'):
                if td.find('a', recursive=False, href=re.compile('/store.asp')) is not None:
                    # find the td element with a link to a store. Its siblings contain
                    # the interesting bits like price and quantity available
                    store_url = td.find('a')['href']
                    store_id = int(utils.get_params(store_url)['sID'])
                    quantity = int(td.next_sibling.text)
                    cost_per_unit = float(re.findall('[0-9.]+', td.next_sibling.next_sibling.text)[0])
                    new.append({
                        'item_id': item['ItemID'],
                        'wanted_color_id': item['ColorID'],
                        'color_id': c,
                        'store_id': store_id,
                        'quantity_available': quantity,
                        'cost_per_unit': cost_per_unit
                    })

            # remove items that cost too much
            if max_cost_quantile is not None and max_cost_quantile < 1.0:
                observed_prices = [e['quantity_available'] * [e['cost_per_unit']] for e in new]
                observed_prices = list(sorted(utils.flatten(observed_prices)))
                if len(observed_prices) > 0:
                    i = utils.quantile(len(observed_prices) - 1, max_cost_quantile)
                    max_price = observed_prices[i]
                    new = filter(lambda x: x['cost_per_unit'] <= max_price, new)

            # add what's left to the considered inventory
            results.extend(new)

        if sum(e['quantity_available'] for e in results) >= item['Qty']:
            # stop early, we've got everything we need
            return results

    return results
## used regular expressions to remove the tags from each line
def removeTag(raw_text):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_text)
    return cleantext

## Pull in the scoreboard and parse through it
try:
    page = requests.get('http://scoreboard.uscyberpatriot.org')
except:
    print("[!]Error: Webpage is unavailable...")
    sys.exit()
html = BS(page.content, 'html.parser')

## Set up for the Excel file
book = xlwt.Workbook()
sheetName = str(input("What round of competition is it? (ex. round1): "))
sheet = book.add_sheet(sheetName)
cols = [
    "Placement", "Team Number", "Location", "Division", "Tier",
    "Scored Images", "Play Time", "Current Score"
]
row = sheet.row(0)
for index, col in enumerate(cols):
    row.write(index, col)
print("~" * 15 + "Starting program" + "~" * 15)
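# The snippet sets up the sheet but never persists it; with xlwt the workbook is
# written out via save() once the rows are filled (filename is illustrative):
# book.save(sheetName + '.xls')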
import urllib2
from bs4 import BeautifulSoup as BS
import json

link = 'https://www.cbssports.com/nfl/stats/playersort/' \
       'nfl/year-2017-season-regular-category-touchdowns'
data = urllib2.urlopen(link)
soup = BS(data.read(), "html.parser")


def FStat():
    data_list = []
    fhandler = soup.find_all(class_={'row1', 'row2'})
    for tdata in fhandler[:20]:
        try:
            player_name = tdata.contents[0].get_text()
            player_position = tdata.contents[1].get_text()
            player_team = tdata.contents[2].get_text()
            touch_downs = tdata.contents[6].get_text()
            # defining json strings
            json_string = {
                "Name": player_name,
                "Position": player_position,
                'Team': player_team,
                "Touchdowns": touch_downs
            }
            print json.dumps(json_string)
        except:
            print 'Please double check the data, something went wrong'
import requests
from bs4 import BeautifulSoup as BS
import re

session = requests.session()
page = session.post('https://yts.ag/')

# Home Page details
# Popular Movies
popular_movies = BS(page.text, "html.parser").find('div', {'id': 'popular-downloads'})
##print(popular_movies)
popular_movies_rating = popular_movies.find_all('h4', {'class': 'rating'})
popular_movies_torrents = popular_movies.find_all('a', {'rel': 'nofollow'})
tempList = popular_movies.find_all('div', {'class': 'browse-movie-tags'})
tempList_torrents = []
for i in tempList:
    torrents = i.find_all('a', {'rel': 'nofollow'})
    if len(torrents) == 2:
        tempList_torrents.append({
            '720p': re.search('(.+)', torrents[0]['href']).group(),
            '1080p': re.search('(.+)', torrents[1]['href']).group()
        })
    else:
        tempList_torrents.append({
            '720p': re.search('(.+)', torrents[0]['href']).group(),
            '1080p': ''
        })
popular_movies = popular_movies.find_all('a', {'class': 'browse-movie-title'})
##print(popular_movies)
##print('\n')
sql = "INSERT INTO scrapydb.cleantriple(subject, subject_alternative, predicate, object,repository) VALUES(%s,%s,%s,%s,%s)" values = (s, sa, p, o, repository) myac.execute(sql, values) mydb.commit() #------------------- TO get Row from DB --------------------- source = 'Skill Commons' myac.execute( 'select * from triplesSC where predicate = "hasRaw" and process = 0;') rows = myac.fetchall() nRows = len(rows) for r in rows: subjectOer = 'oer' metadata_tb = BS(r[3], 'html.parser') #title titleTB = metadata_tb.find('h1').get_text() subjectOer = r[6] saveTriple(subjectOer, '', 'title', titleTB, source) #authors try: domAuthors = metadata_tb.find('ul', { 'class': 'authors' }).find_all('li') for li in domAuthors: author = li.get_text() saveTriple(subjectOer, '', 'author', author, source) except Exception as e:
from bs4 import BeautifulSoup as BS
from pycldf import StructureDataset, Source
import json

# load languages
with open('raw/languages.json') as f:
    langs = json.load(f)

soup = BS(open('raw/42 Chinese dialects.kml').read(), 'xml')
formtable, languagetable, parametertable = [], [], []
for p in soup.findAll('Placemark'):
    name = ' '.join([c.contents[0] for c in p.findAll('name')])
    idx = ' '.join([c.contents[0] for c in p.findAll('description')])
    if name in langs:
        gcode = langs[name]['glottolog']
    else:
        gcode = ''
    coords = p.findAll('coordinates')[0].contents[0].replace('\n', '')
    if len(coords.split(',')) == 3:
        lon, lat, _ = coords.split(',')
        languagetable += [{
            'ID': idx.strip(),
            'Name': name.strip(),
            'Glottocode': gcode,
            'Latitude': lat.strip(),
            'Longitude': lon.strip(),
        }]

with open('raw/Parameters.tsv') as f:
    tmp = f.readlines()
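# A possible continuation for emitting the collected rows as a CLDF dataset with
# pycldf (a sketch, under the assumption that a LanguageTable component is wanted;
# the output directory name is illustrative):
# ds = StructureDataset.in_dir('cldf')
# ds.add_component('LanguageTable')
# ds.write(ValueTable=[], LanguageTable=languagetable)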