def pagination():
    result_csv = pd.read_csv('pagewise_results.csv')
    # get the parameters
    parameters = pd.read_csv('parameters.csv')
    location = parameters['Location'][0]
    searched_tweet = parameters['Searched'][0]
    last_pg = parameters['Pages'][0]
    if pd.isnull(location):
        location = ""
    # Get the current page as the argument in URL
    pg = request.args.get('page', default="1", type=str)
    either_arrows = False
    # Parse the searched .html file for updating the new table
    if (os.path.exists('templates/searched_pg_left_arrow_pagination.html') and
            os.path.exists('templates/searched_pg_right_arrow_pagination.html')):
        time_left = os.path.getmtime(
            'templates/searched_pg_left_arrow_pagination.html')
        time_right = os.path.getmtime(
            'templates/searched_pg_right_arrow_pagination.html')
        either_arrows = True
        if time_left > time_right:
            print('Both exist - left')
            f = open('templates/searched_pg_left_arrow_pagination.html',
                     encoding='utf-8').read()
        else:
            print('Both exist - right')
            f = open('templates/searched_pg_right_arrow_pagination.html',
                     encoding='utf-8').read()
        soup = Soup(f, features="html.parser")
    elif os.path.exists('templates/searched_pg_left_arrow_pagination.html'):
        print('left exists')
        either_arrows = True
        f = open('templates/searched_pg_left_arrow_pagination.html',
                 encoding='utf-8').read()
        soup = Soup(f, features="html.parser")
    elif os.path.exists('templates/searched_pg_right_arrow_pagination.html'):
        print('right exists')
        either_arrows = True
        f = open('templates/searched_pg_right_arrow_pagination.html',
                 encoding='utf-8').read()
        soup = Soup(f, features="html.parser")
    else:
        print('none exist')
        f = open('templates/searched_' + searched_tweet + '_' + location + '.html',
                 encoding='utf-8').read()
        soup = Soup(f, features="html.parser")
    print('current_pg: ', pg)
    p = soup.find("p", {"class": "searched_for"})
    arrow_clicked = ""
    if 'left' in pg or 'right' in pg:
        if 'right' in pg:
            arrow_clicked = "right_arrow_pagination"
            a = soup.find("a", {"id": "left_arrow_pagination"})
            current_pg = soup.find("a", {"id": pg})
            current_pg = current_pg.previous_sibling.previous_sibling
            pg_no = int(current_pg.text)
            for s in soup.find_all("a", {"class": "inactive"}):
                s.decompose()
            for s in soup.find_all("a", {"class": "active"}):
                s.decompose()
            if pg_no != last_pg:
                for i in range(pg_no + 1, pg_no + 21):
                    pages = soup.new_tag("a")
                    if i == pg_no + 1:
                        pages['class'] = 'active'
                    else:
                        pages['class'] = 'inactive'
                    pages['id'] = i
                    pages['onclick'] = "redirectPage(this.id)"
                    pages.string = str(i)
                    a.insert_after(pages)
                    a = pages
                pg = pg_no + 1
        elif 'left' in pg:
            if not either_arrows:
                return render_template(
                    'searched_' + searched_tweet + '_' + location + '.html')
            arrow_clicked = "left_arrow_pagination"
            a = soup.find("a", {"id": "left_arrow_pagination"})
            current_pg = soup.find("a", {"id": pg})
            current_pg = current_pg.next_sibling
            pg_no = int(current_pg.text)
            for s in soup.find_all("a", {"class": "inactive"}):
                s.decompose()
            for s in soup.find_all("a", {"class": "active"}):
                s.decompose()
            if pg_no != 1:
                for i in range(pg_no - 20, pg_no):
                    pages = soup.new_tag("a")
                    if i == pg_no - 20:
                        pages['class'] = 'active'
                    else:
                        pages['class'] = 'inactive'
                    pages['id'] = i
                    pages['onclick'] = "redirectPage(this.id)"
                    pages.string = str(i)
                    a.insert_after(pages)
                    a = pages
                pg = pg_no - 20
            else:
                for i in range(1, 21):
                    pages = soup.new_tag("a")
                    if i == 1:
                        pages['class'] = 'active'
                    else:
                        pages['class'] = 'inactive'
                    pages['id'] = i
                    pages['onclick'] = "redirectPage(this.id)"
                    pages.string = str(i)
                    a.insert_after(pages)
                    a = pages
                pg = 1
    # file = open('templates/searched_pg_' + str(pg) + '.html', "w", encoding="utf-8")
    # file.write(str(soup))
    # file.close()
    # return render_template('searched_pg_' + str(pg) + '.html')
    # If location is empty, then delete previous table results
    if location == "":
        # Remove the table tag for previous page results from the html file.
        for s in soup.select('table'):
            s.extract()
        tweets_per_pg = 30
    # Otherwise, delete the other-results table and display data for the next page
    else:
        for s in soup.find_all("table", {"class": "other_results"}):
            s.decompose()
        tweets_per_pg = 15
    if arrow_clicked == "":
        # Make the previous page class inactive
        a = soup.find("a", {"class": "active"})
        a["class"] = "inactive"
        # Make the current page (pg) class active
        a = soup.find("a", {"id": pg})
        a['class'] = "active"
    pg = int(pg)
    if pg == last_pg:
        # Get remaining results from result_csv
        show_results = result_csv.loc[(pg - 1) * tweets_per_pg + 1:]
    else:
        # Get only tweets_per_pg results from result_csv, depending on the page number
        show_results = result_csv.loc[
            (pg - 1) * tweets_per_pg + 1:pg * tweets_per_pg]
    result = Soup(show_results.to_html(), features="html.parser")
    result.find("tr")['style'] = 'text-align:center;'
    # Make URLs into hyperlinks
    count = 0
    insert = 3
    for td in result.find_all("td"):
        count += 1
        if count == insert:
            if td.text != "Not Available":
                a = soup.new_tag("a")
                a["href"] = td.text
                a.string = td.text
                td.string = ""
                td.append(a)
            insert += 4
        if count == insert - 2:
            td['style'] = "width:12%;"
    table = result.find("table")
    table['border'] = '0'
    if location != "":
        table["class"] = "other_results"
        table['style'] = ('position:absolute;top:800px;padding-left:35px;'
                          'padding-right:35px;text-align:center;')
        p = soup.find("p", {"class": "other_results_para"})
    else:
        table['style'] = ('position:absolute;top:180px;padding-left:35px;'
                          'padding-right:35px;text-align:center;')
    p.insert_after(table)
    if arrow_clicked == "":
        file = open('templates/searched_pg_' + str(pg) + '.html', "w",
                    encoding="utf-8")
        file.write(str(soup))
        file.close()
        return render_template('searched_pg_' + str(pg) + '.html')
    else:
        file = open('templates/searched_pg_' + arrow_clicked + '.html', "w",
                    encoding="utf-8")
        file.write(str(soup))
        file.close()
        return render_template('searched_pg_' + arrow_clicked + '.html')
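# Hedged sketch: pagination() above reads request.args and calls
# render_template, so it is presumably registered as a Flask view; the actual
# decorator and route path are not part of this snippet.
#
# @app.route('/pagination')
# def pagination():
#     ...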
def soup(self) -> Soup:
    """Converts string data from File into a BeautifulSoup object.

    Returns:
        Soup -- BeautifulSoup object created from the File.
    """
    return Soup(self.data)
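# Hedged usage sketch for the soup() method above. The enclosing File class is
# not part of this snippet, so the stub below only assumes a `data` attribute
# holding the raw markup string.
from bs4 import BeautifulSoup as Soup


class _FileStub:
    def __init__(self, data: str) -> None:
        self.data = data  # assumed attribute read by soup()

    def soup(self) -> Soup:
        return Soup(self.data, "html.parser")  # parser named to avoid a warning


print(_FileStub("<p>hello</p>").soup().find("p").text)  # -> hello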
import os

import mechanize
from bs4 import BeautifulSoup as Soup

url = input("enter url:")
browser = mechanize.Browser()
browser.set_handle_equiv(True)
browser.set_handle_redirect(True)
browser.set_handle_referer(True)
browser.set_handle_robots(False)
browser.addheaders = [(
    'User-agent',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
)]
html = browser.open(url)
browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
print(browser.geturl())
soup = Soup(html, "html.parser")
image_tags = soup.findAll('img')
i = 0
for image in image_tags:
    i = i + 1
    filename = image['src']
    print(filename)
    filename = os.path.join(os.getcwd(), str(i))
    data = browser.open(image['src']).read()
    savename = str(i) + '.jpg'
    save = open(savename, 'wb')
    save.write(data)
    save.close()
import datetime
import re
import urllib.request

from bs4 import BeautifulSoup as Soup

today = datetime.date.today()
html = urllib.request.urlopen(
    "http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html"
).read()
soup = Soup(html, "html.parser")
p = re.compile(r'<.*?>')
aslink = soup.findAll('span', attrs={'class': 'ac'})
for li in soup.findAll('li', attrs={'class': 'g'}):
    sLink = li.find('a')
    sSpan = li.find('span', attrs={'class': 'st'})
    print(sLink['href'][7:], "," + p.sub('', str(sSpan)).replace('.', ''))
for adli in soup.findAll('div', attrs={'id': 'rhs_block'}):
    adlink = adli.find('a')
    print(adlink['href'])
print(p.sub('', str(aslink)[1:-1]).replace('.', '\n'))
# for ads in soup.findAll('div', {'id': 'tads'}):
# IDE : PyCharm
# description :
# When working on a computer, we often need to manipulate files and folders.
# Now let's learn how to batch-process files with Python.
# We'll store the content scraped in the previous lesson into a file on our computer.
from bs4 import BeautifulSoup as Soup
import requests

# Everyone loves reading, so today let's scrape the Douban Top 250 books together.
url = 'https://book.douban.com/top250?start=0'
r = requests.get(url)
html_code = r.text
soup = Soup(html_code, "html.parser")
all_book = soup.find_all('table')
count = 0
# First, record the content scraped in the previous lesson into a dictionary.
books = {}
for book in all_book[1:]:
    count = count + 1
    # print('{:*^30}'.format('book', count))
    # print(book.div.a['title'])
    # print(book.p.string)
    # print(book.find(name='span', attrs={'class': 'inq'}).string)
    title = book.div.a['title']
    content = book.p.string
    intro = book.find(name='span', attrs={'class': 'inq'}).string
    books[title] = {'content': content, 'intro': intro}  # record it
from bs4 import BeautifulSoup as Soup

html = "output1.html"
soup = Soup(open(html), "html.parser")
mydivs = soup.find("div", {"id": "legend"})
print(mydivs)
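# Design note: Soup(open(html), ...) above relies on garbage collection to
# close the file handle. An equivalent sketch that closes it explicitly:
from bs4 import BeautifulSoup as Soup

with open("output1.html") as fh:
    legend_div = Soup(fh, "html.parser").find("div", {"id": "legend"})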
def __init__(self, url, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.post = Soup(self.scraper.get(url).content, 'html5lib')
import pickle

from bs4 import BeautifulSoup as Soup

fileName = "senticon.es.xml"
pol = dict()
with open(fileName, "r") as f:
    soup = Soup(f.read(), "xml")
    for lemma in soup.find_all("lemma"):
        pol[(lemma.get_text().strip(), lemma["pos"])] = float(lemma["pol"])
print(pol)
with open("polarity.pkl", "wb") as f:
    pickle.dump(pol, f, pickle.HIGHEST_PROTOCOL)
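# For reference, a minimal sketch of the XML shape the loop above expects:
# <lemma> elements carrying `pos` and `pol` attributes. The values here are
# illustrative, not taken from the real senticon.es.xml.
from bs4 import BeautifulSoup as Soup

sample = '<senticon><lemma pos="adj" pol="0.75"> bueno </lemma></senticon>'
lemma = Soup(sample, "xml").find("lemma")
print(lemma.get_text().strip(), lemma["pos"], float(lemma["pol"]))  # bueno adj 0.75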
def test_metadata_json_html(app_client):
    response = app_client.get("/-/metadata")
    assert response.status == 200
    pre = Soup(response.body, "html.parser").find("pre")
    assert METADATA == json.loads(pre.text)
        return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file
    return None


############################################################
# Globals
############################################################
# WEBSITE = 'http://shinsekai.cadet-nine.org/'
ROOT = pjoin(os.path.split(os.path.abspath(__file__))[0], '..')
soup = Soup('', 'lxml')
TEMPLATE_FOLDER = pjoin(ROOT, "book_templates/epub")


############################################################
# EBook Class
############################################################
class EBook:
    TEMPLATE_FOLDER = pjoin(ROOT, "book_templates/epub")
    """An ebook basically consists of a bunch of html files, usually one per
    chapter, and a table of contents that describes the relationship between
    the chapters"""

    def __init__(self, url, out_file='book.epub', title=None, workers=5,
async def search_task(self, site, page, search):
    """Search tasks."""
    async with site["semaphore"]:
        logger.debug("%s %i - in search_task: search '%s'", self.name,
                     self._episode, search.data)
        content = None
        try:
            page = await page.browser.newPage()
            if self.canceled:
                raise asyncio.CancelledError
            await page.setViewport({"width": 1920, "height": 1080})
            if self.canceled:
                raise asyncio.CancelledError
            await self._going_to(page, site["going_search"])
            logger.debug("%s %i - in search_task: search.data = '%s'",
                         self.name, self._episode, search.data)
            if self.canceled:
                raise asyncio.CancelledError
            await search.run(page)
            if self.canceled:
                raise asyncio.CancelledError
            await self._screenshot(page)
            if self.canceled:
                raise asyncio.CancelledError
            await site["search_button_selector"].run(page)
            if self.canceled:
                raise asyncio.CancelledError
            # sleep(10)
            if site["search_response_selector"] is not None:
                await site["search_response_selector"].run(page)
            if self.canceled:
                raise asyncio.CancelledError
            content = await page.content()  # return HTML document
            # print(content)
        except asyncio.CancelledError:
            raise
        except errors.TimeoutError as error:
            logger.error("%s %i - in search_task : TimeoutError : %s",
                         self.name, self._episode, error)
            await self._close_page(page)
            raise asyncio.CancelledError("TimeoutError : {}".format(error))
        except errors.NetworkError as error:
            logger.error("%s %i - in search_task : NetworkError : %s",
                         self.name, self._episode, error)
            await self._close_page(page)
            raise asyncio.CancelledError("NetworkError : {}".format(error))
        except errors.PageError as error:
            logger.error("%s %i - in search_task : PageError : %s",
                         self.name, self._episode, error)
            await self._close_page(page)
            raise asyncio.CancelledError("PageError : {}".format(error))
        except Exception as error:
            logger.error("%s %i - in search_task : Exception : %s",
                         self.name, self._episode, error)
            await self._close_page(page)
            raise asyncio.CancelledError("Exception : {}".format(error))

        soup = Soup(content, features="lxml")
        ahref = soup.find_all("a", href=True)
        logger.debug("%s %i - in search_task: ahref = '%s'", self.name,
                     self._episode, clean_str(str(ahref)))
        logger.info("%s %i - in search_task: search episode %s of %s",
                    self.name, self._episode, self.episode,
                    " ".join(self.filters_and))
        urls = list()
        for data in ahref:
            if self.as_all_ellements(self.filters_and, data.get_text()):
                if self.as_one_ellement(self.filters_or, data.get_text()):
                    url = full_url(page, data["href"])
                    logger.debug("%s %i - in search_task: append url = '%s'",
                                 self.name, self._episode, url)
                    urls.append((site, url))
        await self._close_page(page)
        return urls
def game_soup(gameId):
    match_url = 'http://www.espn.com/nfl/game?gameId=' + str(gameId)
    u_client = ureq(match_url)
    page_html = u_client.read()
    u_client.close()
    return Soup(page_html, 'html.parser')
    ])

    # ----------- copy images over:
    print(destImagesPath)
    if os.path.exists(sourceImagesPath):
        copytree(sourceImagesPath, destImagesPath)

    chapterDict = {}
    chapterDict['path'] = chapter
    chapterDict['href'] = chapter + ".html"

    # ----------- now let's alter the HTML that's produced:
    if os.path.exists(destChapterPath):
        soup = Soup(open(destChapterPath).read())

        # --- grab the title from h1
        h1s = soup.find_all("h1")
        if len(h1s) > 0:
            chapterDict['title'] = h1s[0].getText()
        else:
            chapterDict['title'] = "needs h1"
        chapterDict['chapterListName'] = chapter
        chapterDict['sections'] = []
        chapterDict['destChapterPath'] = destChapterPath

        # --- Grab all the h2 (we call them sections)
        h2s = soup.find_all("h2")
def parse_subway_status(data):
    """
    Returns a nested dictionary of MTA subway line statuses given
    an XML response.
    """
    # Set all line statuses to base status.
    line_status = {
        line: {
            "state": None,
            "direction_0_state": None,
            "direction_1_state": None,
            "delays_description": None,
            "service_change_description": None,
            "planned_work_description": None
        }
        for line in SUBWAY_LINES
    }

    # Parse MTA lines from XML
    soup = Soup(data.text, "xml")

    # Iterate over line lookup and parse status.
    for line in SUBWAY_LINES:
        # Rename the MTA alias for Shuttle (S).
        line_alias = "H" if line == "S" else str(line)

        # Search for line name in affected lines XML.
        line_re = re.compile("NYCT_" + line_alias + "$")
        hits = [
            _ for _ in soup.find_all("Affects")
            if _.findChildren("LineRef", text=line_re)
        ]

        # Set line status to Good Service if no status.
        if not hits:
            line_status[line].update({
                "state": "Good Service",
                "direction_0_state": "Good Service",
                "direction_1_state": "Good Service",
            })
            continue

        # Parse all subway line situations that contain
        # affected line.
        situations = [_.find_parent("PtSituationElement") for _ in hits]

        # Parse subway line state.
        statuses = [_.ReasonName.text for _ in situations]

        # Look for overlap of statuses with known states
        # in STATE_PRIORITY dictionary
        matches = set(STATE_PRIORITY.keys()).intersection(set(statuses))

        # Set the current state using the minimum of the
        # ordinal STATE_PRIORITY dictionary, or unknown if
        # state does not exist in dictionary.
        if len(matches) > 0:
            line_status[line]["state"] = min(
                {_: STATE_PRIORITY[_] for _ in matches},
                key=STATE_PRIORITY.get)
        else:
            line_status[line]["state"] = "Unknown"

        # Determine state for each direction on the line.
        dir_states = {
            "0": ["Good Service"],
            "1": ["Good Service"],
        }
        for sit in situations:
            # Find affected line directions.
            directions = [
                _.DirectionRef.text
                for _ in sit.find_all("AffectedVehicleJourney")
                if _.findChildren("LineRef", text=line_re)
            ]
            # Add states to line direction.
            for dct in directions:
                dir_states[dct].append(sit.ReasonName.text)

        # Set the direction states using STATE_PRIORITY.
        for dct in dir_states:
            matches = set(STATE_PRIORITY.keys()).intersection(
                set(dir_states[dct]))
            direction = "direction_{}_state".format(dct)
            if len(matches) > 0:
                line_status[line][direction] = min(
                    {_: STATE_PRIORITY[_] for _ in matches},
                    key=STATE_PRIORITY.get)
            else:
                line_status[line][direction] = "Unknown"

        # Set line status descriptions.
        for status in STATE_PRIORITY:
            desc_key = status.lower().replace(" ", "_") + "_description"
            descs = [
                _.find("Description").text for _ in situations
                if _.find("ReasonName").text == status
            ]
            if descs:
                line_status[line][desc_key] = (descs
                                               if len(descs) > 1 else descs[0])

    return line_status
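# Illustrative shape of the dictionary returned above, for one line (values
# are examples, not real MTA data):
example_line_status = {
    "A": {
        "state": "Delays",
        "direction_0_state": "Delays",
        "direction_1_state": "Good Service",
        "delays_description": "Northbound A trains are running with delays.",
        "service_change_description": None,
        "planned_work_description": None,
    },
}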
def soupify_xml(filename: str) -> Soup:
    with open(filename, "r") as f:
        soup = Soup(f, "xml")
    return soup
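# Hedged usage sketch for soupify_xml() (hypothetical filename):
# soup = soupify_xml("records.xml")
# soup.find("TITLE")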
def test_zero_results(app_client, path):
    response = app_client.get(path)
    soup = Soup(response.text, "html.parser")
    assert 0 == len(soup.select("table"))
    assert 1 == len(soup.select("p.zero-results"))
    'non_food_allergies', 'specialized_diet', 'vioscreen_activity_level',
    'vioscreen_age', 'vioscreen_bcodeid', 'vioscreen_bmi', 'vioscreen_dob',
    'vioscreen_eer', 'vioscreen_email', 'vioscreen_finished',
    'vioscreen_gender', 'vioscreen_height',
    'vioscreen_nutrient_recommendation', 'vioscreen_procdate',
    'vioscreen_protocol', 'vioscreen_recno', 'vioscreen_scf',
    'vioscreen_scfv', 'vioscreen_srvid', 'vioscreen_started',
    'vioscreen_subject_id', 'vioscreen_time', 'vioscreen_user_id',
    'vioscreen_visit', 'vioscreen_weight'
]

for filename in os.listdir(folders[3]):
    fullname = os.path.join(folders[3], filename)
    infile = open(fullname, "r")
    contents = infile.read()
    soup = Soup(contents, 'xml')
    title = soup.find('TITLE')
    if title is not None:
        primary_ID = soup.find('PRIMARY_ID')
        external_ID = soup.find('EXTERNAL_ID')
        sample_ID = title
        taxon_ID = soup.find('TAXON_ID')
        science_name = soup.find('SCIENTIFIC_NAME')
        tags = soup.findAll('TAG')
        values = soup.findAll('VALUE')
        infile.close()
        tags = [i.get_text() for i in tags]
        tags = [x for x in tags if x not in notoktags]
        values = [i.get_text() for i in values]
        values_corrected = []
        for i in values:
def test_definition_sql(path, expected_definition_sql, app_client):
    response = app_client.get(path)
    pre = Soup(response.body, "html.parser").select_one("pre.wrapped-sql")
    assert expected_definition_sql == pre.string
def processThesis(thesis, fileObject):
    global URL_WITH_ADDITIONAL_INFO, globalRequest
    thesisInfo = thesis.find_all("i")
    if len(thesisInfo) > 7:
        possibleName = thesisInfo[0]
        nameConvert = ''
        for letter in possibleName:
            nameConvert += str(letter)
        if "roz." in nameConvert:
            print("Roz. problem")
            del thesisInfo[0]
    if len(thesisInfo) != 7:
        print("Bad number of data len: ", len(thesisInfo))
    year = extractYearFromList(thesisInfo)
    successRate = extractSuccessRateFromList(thesisInfo)
    extraLink = extractLink(thesis)
    # print("url: " + URL_WITH_ADDITIONAL_INFO + extraLink)
    requestForPage = handleRequestGET(URL_WITH_ADDITIONAL_INFO + extraLink, 3)
    if requestForPage.status_code != 200:
        print("Cannot handle request with status code: ",
              requestForPage.status_code)
        time.sleep(10)
        fileObject.close()
        return
    soupWebPage = Soup(requestForPage.content, "html5lib")
    authorName = soupWebPage.h3.b.string
    nameOfThesis = soupWebPage.find_all('h2')[1].string
    subjectOfStudy = soupWebPage.find('div', {
        'class': 'oddil'
    }).em.string.split('/')[1]
    typeOfThesis = globalRequest['TIT']
    nameOfSupervisor = extractSupervisor(soupWebPage)
    nameOfOponent = extractOponent(soupWebPage)
    authorName = authorName.replace(',', ' ')
    nameOfThesis = nameOfThesis.replace(',', ' ')
    subjectOfStudy = subjectOfStudy.replace(',', ' ')
    nameOfSupervisor = nameOfSupervisor.replace(',', ' ')
    nameOfOponent = nameOfOponent.replace(',', ' ')
    '''
    print("author: " + authorName)
    print("year: " + year)
    print("type of thesis:" + typeOfThesis)
    print("subjectOfStudy: " + subjectOfStudy)
    print("nameOfThesis: " + nameOfThesis)
    print("success: " + successRate)
    print("supervisor: " + nameOfSupervisor)
    print("oponent: " + nameOfOponent)
    '''
    csvRow = (authorName + ',' + nameOfThesis + ',' + subjectOfStudy + ',' +
              typeOfThesis + ',' + year + ',' + successRate + ',' +
              nameOfSupervisor + ',' + nameOfOponent + '\n')
    '''
    print("author: " + authorName + "; year: " + year + "; supervisor: " +
          nameOfSupervisor + "; oponent: " + nameOfOponent)
    print("nameOfThesis: " + nameOfThesis + "; subjectOfStudy: " + subjectOfStudy)
    print("Success: " + successRate)
    '''
    if len(csvRow.split(',')) == 8:
        fileObject.write(csvRow)
        print("Successful write to CSV file")
    else:
        print("Bad CSV format!!!")
def test_sort_links(app_client):
    response = app_client.get("/fixtures/sortable?_sort=sortable")
    assert response.status == 200
    ths = Soup(response.body, "html.parser").findAll("th")
    attrs_and_link_attrs = [{
        "attrs": th.attrs,
        "a_href": (th.find("a")["href"].split("/")[-1]
                   if th.find("a") else None),
    } for th in ths]
    assert [
        {"attrs": {"class": ["col-Link"], "scope": "col"}, "a_href": None},
        {"attrs": {"class": ["col-pk1"], "scope": "col"}, "a_href": None},
        {"attrs": {"class": ["col-pk2"], "scope": "col"}, "a_href": None},
        {"attrs": {"class": ["col-content"], "scope": "col"}, "a_href": None},
        {
            "attrs": {"class": ["col-sortable"], "scope": "col"},
            "a_href": "sortable?_sort_desc=sortable",
        },
        {
            "attrs": {"class": ["col-sortable_with_nulls"], "scope": "col"},
            "a_href": "sortable?_sort=sortable_with_nulls",
        },
        {
            "attrs": {"class": ["col-sortable_with_nulls_2"], "scope": "col"},
            "a_href": "sortable?_sort=sortable_with_nulls_2",
        },
        {
            "attrs": {"class": ["col-text"], "scope": "col"},
            "a_href": "sortable?_sort=text",
        },
    ] == attrs_and_link_attrs
from requests import get
from bs4 import BeautifulSoup as Soup

url = get(
    "https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating"
)
request = url.text
soup_data = Soup(request, 'html.parser')
# print(soup_data.title.text)
movies = soup_data.findAll('div', {"class": 'lister-item mode-advanced'})
first_movie = movies[0]
Name = []
Position = []
Year = []
Rating = []
Ure = []
# x = (first_movie.find('div', {"class": "lister-item-image float-left"}).find('a').get("href"))
# div = first_movie.find('div', {"class": "lister-item-image float-left"})
# moovie_link = div.find("a").get("href")
for i in movies:
    Name.append(i.h3.a.text)
    Position.append(
        i.find('span', {
            "class": "lister-item-index unbold text-primary"
        }).text[:1])
    Year.append(
        i.find('span', {
            "class": "lister-item-year text-muted unbold"
        }).text[1:5])
    Rating.append(
        i.find('div', {"class": "inline-block ratings-imdb-rating"})['data-value'])
def test_facet_display(app_client):
    response = app_client.get(
        "/fixtures/facetable?_facet=planet_int&_facet=city_id&_facet=on_earth")
    assert response.status == 200
    soup = Soup(response.body, "html.parser")
    divs = soup.find("div", {"class": "facet-results"}).findAll("div")
    actual = []
    for div in divs:
        actual.append({
            "name": div.find("strong").text,
            "items": [{
                "name": a.text,
                "qs": a["href"].split("?")[-1],
                "count": int(str(a.parent).split("</a>")[1].split("<")[0]),
            } for a in div.find("ul").findAll("a")],
        })
    assert [
        {
            "name": "city_id",
            "items": [
                {
                    "name": "San Francisco",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&city_id=1",
                    "count": 6,
                },
                {
                    "name": "Los Angeles",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&city_id=2",
                    "count": 4,
                },
                {
                    "name": "Detroit",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&city_id=3",
                    "count": 4,
                },
                {
                    "name": "Memnonia",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&city_id=4",
                    "count": 1,
                },
            ],
        },
        {
            "name": "planet_int",
            "items": [
                {
                    "name": "1",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&planet_int=1",
                    "count": 14,
                },
                {
                    "name": "2",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&planet_int=2",
                    "count": 1,
                },
            ],
        },
        {
            "name": "on_earth",
            "items": [
                {
                    "name": "1",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&on_earth=1",
                    "count": 14,
                },
                {
                    "name": "0",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&on_earth=0",
                    "count": 1,
                },
            ],
        },
    ] == actual
def __init__(self, conf_file=""): self.db_conn_dict = {} self.queries_dict = {} self.users_dict = {} self.chartboards = []; self.appName="" handler = open(conf_file).read() soup = Soup(handler,'xml') ''' load env variable from config file ''' env = soup.find('env') orahome = env.find('ora_home').string ldlibpath = env.find('ld_library_path').string self.appName= env.find('appName').string os.putenv('ORACLE_HOME', orahome) os.putenv('LD_LIBRARY_PATH',ldlibpath) dss = soup.find('datasources') ''' load datasources from config file ''' for ds in dss.findAll('datasource'): dbtype = ds.find("type").string name= ds.find("name").string host= ds.find("host").string if ds.find("host")!=None else ""; driver=ds.find("driver").string if ds.find("driver")!=None else ""; port= ds.find("port").string if ds.find("port")!=None else ""; user= ds.find("user").string if ds.find("user")!=None else ""; password=ds.find("password").string if ds.find("password")!=None else ""; service= ds.find("service").string if ds.find("service")!=None else ""; sid= ds.find("sid").string if ds.find("sid")!=None else ""; self.db_conn_dict[name] = {'type':dbtype,'host':host,'driver':driver,'port':port,'user':user,'password':password,'service':service,'sid':sid }; ''' load queries from xml file ''' queries = soup.find('queries') for q in queries.findAll('query'): #add Query to queries dict query_ = q.find('sql').string params=() name_ = q.attrs['name'] target_ = q.attrs['ds'] paramNum = q.attrs['params'] if 'params' in q.attrs else None _desc = q.attrs['description'] if 'description' in q.attrs else "" select_number = int(q.attrs['selects']) if 'selects' in q.attrs else 1 if paramNum: k=1; parameters = q.find('params') for p in parameters.findAll('param'): _p_name=p.attrs['name'] _p_type=p.attrs['paramtype'] _p_desc=p.attrs['description'] _combo_vals =p.attrs['vals'] if 'vals' in p.attrs else '' toAdd=Param(k,_p_name,_p_type,_combo_vals.split(','),_p_desc ) params=params+(toAdd,) k=k+1 #print toAdd,'from xml: ',_combo_vals qry = Query(query = query_,parmap=params,target = target_,parnum=paramNum,name = name_,selectNumber=select_number,description=_desc) #print qry self.queries_dict[name_] = qry ''' load users from xml file ''' users = soup.find('users') for u in users.findAll('user'): username = u.find("username").string pswd = u.find("password").string dslist = (); datasourceslist = u.findAll("ds") for d in datasourceslist: dslist = dslist+(d.string,) user = User(username,pswd,dslist) self.users_dict[username] = user ''' load chartboards from xml file ''' chartsb=soup.find('chartboards') for c in chartsb.findAll('chartboard'): toadd={} _user = c.find("user").string _type=c.find("type").string _querydata=c.find("querydata").string _title = c.find("title").string toadd['user']=_user toadd['type']=_type toadd['querydata']=_querydata toadd['title']=_title toadd['inverted']=True if 'inverted' in c.attrs else False self.chartboards.append(toadd)
def test_database_download_disallowed_for_mutable(app_client):
    response = app_client.get("/fixtures")
    soup = Soup(response.body, "html.parser")
    assert 0 == len(soup.findAll("a", {"href": re.compile(r"\.db$")}))
    assert 403 == app_client.get("/fixtures.db").status
import scraperwiki
from urllib.request import urlopen

from bs4 import BeautifulSoup as Soup

# step 1: build the urls
base_url = "http://evenementen.uitslagen.nl/2013/marathonrotterdam/details.php?s="
end_url = "&o=1&t=nl"
for num in range(1, 10):
    baseplusnr = base_url + str(num)
    url = baseplusnr + end_url
    # step 2: open the urls
    soup = Soup(urlopen(url), "html.parser")
    # step 3: skip the 'not found' pages
    split = soup.find("b", style="color:red")
    if split is None:
        col = soup.findAll('td')
        # step 4: pick out which table data we want
        startnr = col[0].string.replace("Startnummer", "")
        naam = col[4].string
        woonplaats = col[6].string
        afstand = col[8].string
        cat = col[10].string
        totplaats = col[12].string
def test_allow_sql_on(app_client):
    response = app_client.get("/fixtures")
    soup = Soup(response.body, "html.parser")
    assert len(soup.findAll("textarea", {"name": "sql"}))
    response = app_client.get("/fixtures/sortable")
    assert b"View and edit SQL" in response.body
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as Soup

my_url = "https://www.eventbrite.com/d/india--bengaluru/events/"
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = Soup(page_html, "html.parser")
containers = page_soup.findAll(
    "div", {"class": "eds-media-card-content__content__principal"})

filename = "events.csv"
f = open(filename, "w")
headers = "event, day, date, time, location, price\n"
f.write(headers)

for container in containers:
    title_container = container.a.div.div
    event = title_container.text
    another_container = container.findAll(
        "div", {"class": "eds-media-card-content__sub-content"})
    date = another_container[0].div.text
    place_container = container.findAll(
        "div", {"class": "eds-media-card-content__sub-content-cropped"})
def _get_soup(html=PYCON_HTML):
    return Soup(html.read_text(encoding="utf-8"), "html.parser")
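# The read_text() call above implies PYCON_HTML is a pathlib.Path; its
# definition is not part of this snippet, so this is only an assumed sketch:
# from pathlib import Path
# PYCON_HTML = Path("pycon_events.html")  # hypothetical filename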
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Greenwood%20Village%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Lakewood%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Castle%20Rock%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Littleton%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Louisville%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Loveland%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22N%252E%20Lakewood%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Northglenn%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Westminster%2C%20CO%22}]'
]

pd.set_option('max_colwidth', 500)  # remove column limits or info will be lost
df = pd.DataFrame()  # Create a new data frame

for a in base_url:
    target = Soup(urllib.urlopen(a), "lxml")

    # This part of the code returns specific job info desired
    targetElements = target.findAll('li')
    for elem in targetElements:
        comp_name = "Lowes"
        try:
            job_title = elem.find('h2').getText()
        except AttributeError:
            job_title = "null"
        home_url = "https://jobs.lowes.com/"
        try:
            job_link = "%s%s" % (home_url, elem.find('a').get('href'))
        except AttributeError:
            job_link = "null"
def get_data():
    if request.method == 'POST':
        for filename in glob.glob("templates/searched*"):
            os.remove(filename)
        searched_tweet = request.form['search']
        location = request.form['location']
        result_csv = requestResults(searched_tweet)
        f = open('templates/trying_local.html').read()
        soup = Soup(f, features="html.parser")
        p = soup.find("p", {"class": "searched_for"})
        paginate = soup.find("div", {"class": "pagination"})
        if result_csv.empty:
            p.append("You searched for: " + searched_tweet +
                     ". This is a Non-Donation request.")
        else:
            # Check for tweets at the given location. If no location is given,
            # show results for the whole world (only 30 tweets per page).
            if location != "":
                show_results = result_csv[
                    result_csv['Location'].str.contains(location.upper()) |
                    result_csv['Location'].str.contains(location.lower())]
                location_results = show_results
                # If no tweets are present at the searched location
                if len(show_results) == 0:
                    show_results = result_csv[:30]
                    p.append("You searched for: " + searched_tweet + " at " +
                             location + ". Found 0 results. Displaying " +
                             str(len(result_csv)) +
                             " results for other locations.")
                    location = ""
                else:
                    result_csv = result_csv[
                        ~result_csv['Location'].str.contains(location.upper()) &
                        ~result_csv['Location'].str.contains(location.lower())]
                    # Show only the top tweets of the searched location
                    if len(show_results) > 15:
                        show_results = show_results[:13]
                    p.append("You searched for: " + searched_tweet + " at " +
                             location + ". Found " + str(len(show_results)) +
                             " results.")
            else:
                show_results = result_csv[:30]
                p.append("You searched for: " + searched_tweet + ". Found " +
                         str(len(result_csv)) + " results.")
            if location == "":
                n = len(result_csv) // 30
                # round up for a partial last page (the original checked n % 30)
                if len(result_csv) % 30 != 0:
                    n += 1
            else:
                n = len(result_csv) // 15
                if len(result_csv) % 15 != 0:
                    n += 1
            parameters = pd.DataFrame({
                'Location': [location],
                'Searched': [searched_tweet],
                'Pages': n
            })
            # result_csv['location_searched'] = location
            # result_csv['searched'] = searched_tweet
            parameters.to_csv('parameters.csv', index=False)
            result_csv.to_csv('pagewise_results.csv', index=False)
            # result_csv = result_csv.drop(['location_searched', 'searched'], axis=1)
            show_results.reset_index(drop=True, inplace=True)
            show_results.index += 1
            result = Soup(show_results.to_html(), features="html.parser")
            result.find("tr")['style'] = 'text-align:center;'
            # Make URLs into hyperlinks
            count = 0
            insert = 3
            for td in result.find_all("td"):
                count += 1
                if count == insert:
                    if td.text != "Not Available":
                        a = soup.new_tag("a")
                        a["href"] = td.text
                        a.string = td.text
                        td.string = ""
                        td.append(a)
                    insert += 4
                if count == insert - 2:
                    td['style'] = "width:12%;"
            table = result.find("table")
            table['border'] = '0'
            table['style'] = ('position:absolute;top:180px;padding-left:35px;'
                              'padding-right:35px;text-align:center;')
            p.insert_after(result)
            a = soup.find("a", {"id": 1})
            if location == "":
                # n = len(result_csv)//30
                if n > 20:
                    paginate["style"] = (
                        "position:absolute;left:50%;top:190%;width:71%;"
                        "transform: translate(-50%, -50%); "
                        "background-color: #525252;background-size: cover;")
                else:
                    paginate["style"] = (
                        "position:absolute;left:50%;top:190%;"
                        "transform: translate(-50%, -50%); "
                        "background-color: #525252;background-size: cover;")
            else:
                # n = len(result_csv)//15
                if n > 20:
                    paginate["style"] = (
                        "position:absolute;left:50%;top:195%;width:71%;"
                        "transform: translate(-50%, -50%); "
                        "background-color: #525252;background-size: cover;")
                else:
                    paginate["style"] = (
                        "position:absolute;left:50%;top:195%;"
                        "transform: translate(-50%, -50%); "
                        "background-color: #525252;background-size: cover;")
            if n > 20:
                n = 20
            # if (n % 30 != 0):
            #     n += 1
            for i in range(n - 1):
                pages = soup.new_tag("a")
                pages['class'] = 'inactive'
                pages['id'] = i + 2
                pages['onclick'] = "redirectPage(this.id)"
                pages.string = str(i + 2)
                a.insert_after(pages)
                a = pages
            if location != "":
                # Other Location Results
                p = soup.new_tag("p")
                p['class'] = "other_results_para"
                p['style'] = "position: absolute;top:750px;font-weight: bold;"
                p.string = ("Other Location Tweets (Found " +
                            str(len(result_csv)) + " results)")
                table = soup.find("table", {"class": "dataframe"})
                table.insert_after(p)
                other_results = result_csv[:15]
                other_results.reset_index(drop=True, inplace=True)
                other_results.index += 1
                result = Soup(other_results.to_html(), features="html.parser")
                result.find("tr")['style'] = 'text-align:center;'
                # Make URLs into hyperlinks
                count = 0
                insert = 3
                for td in result.find_all("td"):
                    count += 1
                    if count == insert:
                        if td.text != "Not Available":
                            a = soup.new_tag("a")
                            a["href"] = td.text
                            a.string = td.text
                            td.string = ""
                            td.append(a)
                        insert += 4
                    if count == insert - 2:
                        td['style'] = "width:12%;"
                table = result.find("table")
                table['class'] = 'other_results'
                table['border'] = '0'
                table['style'] = ('position:absolute;top:800px;'
                                  'padding-left:35px;padding-right:35px;'
                                  'text-align:center;')
                p = soup.find("p", {"class": "other_results_para"})
                p.insert_after(table)
        file = open('templates/searched_' + searched_tweet + '_' + location +
                    '.html', "w", encoding="utf-8")
        file.write(str(soup))
        file.close()
        return render_template('searched_' + searched_tweet + '_' + location +
                               '.html')