def test_submit_set():
    """Complete and submit the pizza form at http://httpbin.org/forms/post """
    browser = mechanicalsoup.Browser()
    page = browser.get("http://httpbin.org/forms/post")
    form = mechanicalsoup.Form(page.soup.form)

    form["custname"] = "Philip J. Fry"
    form["size"] = "medium"
    form["topping"] = ("cheese", "onion")
    form["comments"] = "freezer"

    response = browser.submit(form, page.url)

    # helpfully the form submits to http://httpbin.org/post, which simply
    # echoes the submitted form data back in json format
    json = response.json()
    data = json["form"]
    assert data["custname"] == "Philip J. Fry"
    assert data["custtel"] == ""  # a web browser submits "" for an input left blank
    assert data["size"] == "medium"
    assert data["topping"] == ["cheese", "onion"]
    assert data["comments"] == "freezer"

    browser.close()
def test_construct_form_fail():
    """Form objects must be constructed from form html elements."""
    soup = bs4.BeautifulSoup('<notform>This is not a form</notform>', 'lxml')
    tag = soup.find('notform')
    assert isinstance(tag, bs4.element.Tag)
    with pytest.raises(mechanicalsoup.LinkNotFoundError):
        mechanicalsoup.Form(tag)
def test_submit_online(httpbin):
    """Complete and submit the pizza form at http://httpbin.org/forms/post """
    browser = mechanicalsoup.Browser()
    page = browser.get(httpbin + "/forms/post")
    form = mechanicalsoup.Form(page.soup.form)

    input_data = {"custname": "Philip J. Fry"}
    form.input(input_data)

    check_data = {"size": "large", "topping": ["cheese"]}
    form.check(check_data)
    check_data = {"size": "medium", "topping": "onion"}
    form.check(check_data)

    form.textarea({"comments": "warm"})
    form.textarea({"comments": "actually, no, not warm"})
    form.textarea({"comments": "freezer"})

    response = browser.submit(form, page.url)

    # helpfully the form submits to http://httpbin.org/post, which simply
    # echoes the submitted form data back in json format
    json = response.json()
    data = json["form"]
    assert data["custname"] == "Philip J. Fry"
    assert data["custtel"] == ""  # a web browser submits "" for an input left blank
    assert data["size"] == "medium"
    assert data["topping"] == ["cheese", "onion"]
    assert data["comments"] == "freezer"
def test_construct_form_fail():
    """Form objects must be constructed from form html elements."""
    soup = bs4.BeautifulSoup('<notform>This is not a form</notform>', 'lxml')
    tag = soup.find('notform')
    assert isinstance(tag, bs4.element.Tag)
    with pytest.warns(FutureWarning, match="from a 'notform'"):
        mechanicalsoup.Form(tag)
def oauth_scheme(args):
    browser = mechanicalsoup.StatefulBrowser()
    login_page = browser.open(oauth_url + urlencode({
        'client_id': args.client_id,
        'scope': args.scope
    }))
    login_form = mechanicalsoup.Form(login_page.soup.select_one('form'))
    login_form.input({"email": args.login, "pass": args.password})
    page2 = browser.submit(login_form, login_page.url)
    if page2.soup.select_one('.service_msg_warning') is None:
        login_form2 = mechanicalsoup.Form(page2.soup.select_one('form'))
        page3 = browser.submit(login_form2, page2.url)
        token = re.search(r"token=([a-zA-Z0-9]+)&", page3.url).group(1)
        print(token)
    else:
        print("Password is not correct, or some other error occurred")
def get_courses_of_semester(semester):
    soup = state.tucan_br.getcached(state.TUCAN_START_URL)
    soup = state.tucan_br.getcached(
        TUCAN_URL + soup.select_one('li[title="Lehrveranstaltungssuche"] a')['href'])
    form = ms.Form(soup.select_one("#findcourse"))
    form['course_catalogue'] = semester
    # we need two criteria to start the search; this should show everything
    form['with_logo'] = '2'
    form.choose_submit("submit_search")
    page = state.tucan_br.submit(form, TUCAN_URL + form.form['action'])
    return walk_tucan_list(page.soup)
def getJornada(self, ligaID, idJornada, browser):
    pageForm = browser.get_current_page().find("form", {"id": 'FormClasificacion'})
    pageForm['action'] = "/privadas/ver/id/{}/tipo/jornada/jornada/{}".format(ligaID, idJornada)

    jorForm = mechanicalsoup.Form(pageForm)
    jorForm['jornada'] = str(idJornada)
    resJornada = browser.submit(jorForm, browser.get_url())
    bs4Jornada = BeautifulSoup(resJornada.content, "lxml")

    jorResults = ClasifData(label="jornada{}".format(idJornada),
                            source=browser.get_url(),
                            content=bs4Jornada)
    return jorResults
def test_choose_submit_twice():
    """Test that calling choose_submit twice fails."""
    text = '''
    <form>
      <input type="submit" name="test1" value="Test1" />
      <input type="submit" name="test2" value="Test2" />
    </form>
    '''
    soup = bs4.BeautifulSoup(text, 'lxml')
    form = mechanicalsoup.Form(soup.form)
    form.choose_submit('test1')
    expected_msg = 'Submit already chosen. Cannot change submit!'
    with pytest.raises(Exception, match=expected_msg):
        form.choose_submit('test2')
def log_into_sso(credentials) -> ms.Browser:
    browser = ms.Browser(soup_config={"features": "lxml"})  # or html.parser
    page = browser.get(SSO_URL)
    message = page.soup.select("#msg")
    if message and 'class="success"' not in str(message):
        raise Exception(message[0])
    form = ms.Form(page.soup.select('#fm1')[0])
    form["username"] = credentials["username"]
    form["password"] = credentials["password"]
    page = browser.submit(form, page.url)
    message = page.soup.select("#msg")
    if message and 'class="success"' not in str(message):
        raise Exception(message[0])
    return browser
def test_submit_online():
    browser = mechanicalsoup.Browser()
    page = browser.get("https://brickseek.com/walmart-inventory-checker")
    form = mechanicalsoup.Form(page.soup.form)

    input_data = {"zip": "11784", "item_id": "9914706"}
    form.input(input_data)

    response = browser.submit(form, page.url)
    # the response body is parsed as json
    json = response.json()
    data = json["form"]
    print(data)
def download_tucan_vv_search():
    print("\ntucan-vv search")
    soup = tucan_browser.getcached(TUCAN_START_URL)
    soup = tucan_browser.getcached(
        TUCAN_URL + soup.select_one('li[title="Lehrveranstaltungssuche"] a')['href'])
    form = ms.Form(soup.select_one("#findcourse"))
    semester_list = [(i.text, i['value'])
                     for i in soup.select('#course_catalogue option')
                     if TUCAN_THIS_SEMESTER_SEARCH_OPTION in i.text]
    print(semester_list[0])
    form['course_catalogue'] = semester_list[0][1]  # newest semester
    # we need two criteria to start the search; this should show everything
    form['with_logo'] = '2'
    form.choose_submit("submit_search")
    page = tucan_browser.submit(form, TUCAN_URL + form.form['action'])
    return walk_tucan_list(page.soup)
def getClasif(categ, browser, liga):
    pageForm = browser.get_current_page().find("form", {"id": 'FormClasificacion'})
    pageForm['action'] = "/privadas/ver/id/{}/tipo/{}".format(liga, categ)

    selItem = pageForm.find("option", {'selected': 'selected'})
    jorForm = mechanicalsoup.Form(pageForm)
    if selItem:
        curJornada = selItem['value']
        jorForm['jornada'] = str(curJornada)

    resJornada = browser.submit(jorForm, browser.get_url())
    bs4Jornada = BeautifulSoup(resJornada.content, "lxml")

    jorResults = ClasifData(label=categ,
                            source=browser.get_url(),
                            content=bs4Jornada)
    return jorResults
def scrape_HGMD_all_mutations(self, hgmd_username, hgmd_password):
    browser = mechanicalsoup.Browser()
    login_page = browser.get(
        "http://portal.biobase-international.com/cgi-bin/portal/login.cgi")
    time.sleep(2)
    login_form = mechanicalsoup.Form(
        login_page.soup.select_one('#login_form'))
    time.sleep(2)
    # login username and password are required as strings
    login_form.input({"login": hgmd_username, "password": hgmd_password})
    time.sleep(2)
    r = browser.submit(login_form, login_page.url)
    time.sleep(2)
    try:
        soup = self.form_finder(browser, self.gene)
    except Exception:
        print("\nHGMD exception executed")
        print(
            "Check HGMD username and password are correct and try again.\n"
            "Alternatively check you are not already logged in to HGMD with a web browser:\n"
            "https://portal.biobase-international.com/cgi-bin/portal/login.cgi\n")
        sys.exit()
    return soup
def log_into_tucan(credentials) -> ms.Browser:
    print("logging in")
    browser, page = anonymous_tucan()
    login_form = ms.Form(page.soup.select('#cn_loginForm')[0])
    login_form['usrname'] = credentials["username"]
    login_form['pass'] = credentials["password"]
    page = browser.submit(login_form, page.url)
    if 'refresh' not in page.headers:
        print(re.sub("\n+", "\n", re.sub("[ \t]+", " ", page.soup.text)))
        print("===============")
        print("This means you probably used the wrong username/password.")
        print("===============")
        sys.exit()
    print("ok")

    redirected_url = "=".join(page.headers['REFRESH'].split('=')[1:])
    page = browser.get(TUCAN_URL + redirected_url)
    page = browser.get(_get_redirection_link(page))

    state.TUCAN_START_URL = page.url
    state.session_key = page.url.split("-")[2]  # "N000000000000001," == anonymous
    return browser
def signin(self, user, password):
    page = self.get_page('/')
    form = mechanicalsoup.Form(page.soup.select_one('#cn_loginForm'))
    form.input({'usrname': user, 'pass': password})
    response = self.browser.submit(form, page.url)
    return self.get_page('='.join(response.headers['REFRESH'].split('=')[1:]))
from __future__ import print_function
import argparse
import mechanicalsoup
from getpass import getpass

parser = argparse.ArgumentParser(description="Login to GitHub.")
parser.add_argument("username")
args = parser.parse_args()

# NOTE: the snippet omits a few set-up lines; this reconstruction assumes the
# usual GitHub login flow (password prompt, Browser creation, login page GET).
args.password = getpass("Password: ")

browser = mechanicalsoup.Browser(soup_config={'features': 'lxml'})
login_page = browser.get("https://github.com/login")

# similar to assert login_page.ok, but with the full status code in case of
# failure.
login_page.raise_for_status()

# login_page.soup is a BeautifulSoup object
# http://www.crummy.com/software/BeautifulSoup/bs4/doc/#beautifulsoup

# we grab the login form
login_form = mechanicalsoup.Form(login_page.soup.select_one('#login form'))
# specify username and password
login_form.input({"login": args.username, "password": args.password})
# submit form
page2 = browser.submit(login_form, login_page.url)

# verify we are now logged in
messages = page2.soup.find("div", class_="flash-messages")
if messages:
    print(messages.text)
assert page2.soup.select(".logout-form")

print(page2.soup.title.text)

# verify we remain logged in (thanks to cookies) as we browse the rest of
# the site
page3 = browser.get("https://github.com/MechanicalSoup/MechanicalSoup")
assert page3.soup.select(".logout-form")
# Setup browser
browser = mechanicalsoup.StatefulBrowser(
    soup_config={'features': 'lxml'},
    user_agent='Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.13) Gecko/20101206 Ubuntu/10.10 (maverick) Firefox/3.6.13',
)

# Request Cox login page. The result is a requests.Response object
login_page = browser.get(login_url)

# Similar to assert login_page.ok but with full status code in case of failure.
login_page.raise_for_status()

# Grab the login form. login_page.soup is a BeautifulSoup object
login_form = mechanicalsoup.Form(
    login_page.soup.select_one('form[name="sign-in"]'))

# Specify username and password
login_form.input({'username': cox_user, 'password': cox_pass})

# Submit form
browser.submit(login_form, login_page.url)

# Read the stats URL
stats_page = browser.get(stats_url)

# Grab the script with the stats in it
stats = stats_page.soup.findAll('script', string=re.compile('utag_data'))[0].string

# Split and rsplit on the first { and on the last }, which is where the data
# object is located
def google_authenticate(username, password):
    ts = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
    data_response = {
        'timestamp': ts,
        'username': username,
        'password': password,
        'success': False,
        'change': False,
        '2fa_enabled': False,
        'type': None,
        'code': None,
        'name': None,
        'action': None,
        'headers': [],
        'cookies': [],
    }
    try:
        browser = mechanicalsoup.StatefulBrowser(
            soup_config={'features': 'html'},
            raise_on_404=True,
            user_agent='Python-urllib/2.7',
        )
        page = browser.open('https://www.gmail.com')
        user_form = browser.select_form('form')
        user_form.set('Email', username)
        user_response = browser.submit(user_form, page.url)
        pass_form = mechanicalsoup.Form(user_response.soup.form)
        pass_form.set('Passwd', password)
        pass_response = browser.submit(pass_form, page.url)
        raw_headers = pass_response.headers
        soup = pass_response.soup
        raw = soup.text

        sms = soup.find('input', {'id': 'idvPreregisteredPhonePin'})
        sms_old = soup.find('button', {'id': 'idvPreresteredPhoneSms'})
        u2f = soup.find('input', {'id': 'id-challenge'})
        touch = soup.find('input', {'id': 'authzenToken'})
        authenticator = soup.find('input', {'id': 'totpPin'})
        backup = soup.find('input', {'id': 'backupCodePin'})

        if 'Wrong password. Try again.' in raw:
            data_response['success'] = False
        elif 'Loading {}'.format(username) in raw:
            data_response['success'] = True
        if 'you need to change your password' in raw:
            data_response['change'] = True
            data_response['success'] = True

        if sms or sms_old:
            data_response['type'] = 'sms'
            data_response['2fa_enabled'] = True
            data_response['success'] = True
            if sms_old:
                final_form = mechanicalsoup.Form(pass_response.soup.form)
                final_response = browser.submit(final_form, page.url)
                raw_headers = final_response.headers
                raw = final_response.soup.text
                data_response['type'] = 'u2f'
            code = ''
            regexes = [
                r"\d{2}(?=</b>)",
                r"(?<=\u2022)\d{2}(?=G)",
                r"\d{2}(?=G)",
                r"\d{2}(?=\</b>)",
                r"\d{2}(?=S)",
            ]
            for regex in regexes:
                matches = re.search(regex, raw, re.UNICODE)
                if matches:
                    code = matches.group()
                    break
                else:
                    code = '••'
            data_response['code'] = code
        elif u2f:
            data_response['type'] = 'u2f'
            data_response['2fa_enabled'] = True
            data_response['success'] = True
        elif touch:
            code = ''
            name = ''
            regex_codes = [
                r"(?<=<b>)\d{1,3}(?=</b>)",
                r"(?<=then tap )\d{1,3}(?= on your phone)"
            ]
            for regex_code in regex_codes:
                code_match = re.search(regex_code, raw)
                if code_match:
                    code = code_match.group()
                else:
                    code = 0
            regex_names = [
                r"(?<=Unlock your ).*(?=Tap)",
                r"(?<=Check your ).*(?=<\/h2>)",
            ]
            for regex_name in regex_names:
                name_match = re.search(regex_name, raw)
                if name_match:
                    name = name_match.group()
                else:
                    name = 'phone'
            data_response['code'] = code
            data_response['name'] = name
            data_response['type'] = 'touchscreen'
            data_response['2fa_enabled'] = True
            data_response['success'] = True
        elif authenticator:
            name = ''
            regexes = [
                r"(?<=Get a verification code from the <strong>).*(?=<\/strong>)",
                r"(?<=Get a verification code from the ).*(?= app)",
            ]
            for regex in regexes:
                name_match = re.search(regex, raw, re.UNICODE)
                if name_match:
                    name = name_match.group()
                else:
                    name = 'authenticator app'
            data_response['name'] = name
            data_response['type'] = 'authenticator'
            data_response['2fa_enabled'] = True
            data_response['success'] = True
        elif backup:
            data_response['type'] = 'backup'
            data_response['2fa_enabled'] = True
            data_response['success'] = True
        else:
            if 'Try again in a few hours' in raw:
                data_response['error'] = 'locked out'
                data_response['action'] = 'redirect'

        cookies = []
        for c in browser.get_cookiejar():
            cookie = {}
            cookie['name'] = c.name
            cookie['value'] = c.value
            cookie['domain'] = c.domain
            cookie['path'] = c.path
            cookie['secure'] = c.secure
            cookie['expires'] = c.expires
            cookies.append(cookie)
        data_response['cookies'] = cookies

        for h in raw_headers:
            header = {}
            header['name'] = h
            header['value'] = raw_headers[h]
            data_response['headers'].append(header)
    except Exception as ex:
        data_response['error'] = ex
    return data_response
def cox_usage_cb(self, kwargs):
    # URL that we authenticate against
    login_url = "https://www.cox.com/resaccount/sign-in.cox"
    # URL that we grab all the data from
    stats_url = "https://www.cox.com/internet/mydatausage.cox"
    # Your cox user account (e.g. [email protected]) and password
    cox_user = self.config['cox']['username']
    cox_pass = self.config['cox']['password']

    # Setup browser
    browser = mechanicalsoup.StatefulBrowser(
        soup_config={'features': 'lxml'},
        user_agent='Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.13) Gecko/20101206 Ubuntu/10.10 (maverick) Firefox/3.6.13',
    )
    # Disable SSL verification workaround for issue #2
    browser.session.verify = False

    # Open the login URL
    login_page = browser.get(login_url)
    # Similar to assert login_page.ok but with full status code in case of failure.
    login_page.raise_for_status()

    # Find the form named sign-in
    login_form = mechanicalsoup.Form(
        login_page.soup.select_one('form[name="sign-in"]'))
    # Specify username and password
    login_form.input({'username': cox_user, 'password': cox_pass})
    # Submit the form
    browser.submit(login_form, login_page.url)

    # Read the stats URL
    stats_page = browser.get(stats_url)
    # Grab the script with the stats in it
    stats = stats_page.soup.findAll(
        'script', string=re.compile('utag_data'))[0].string
    # Split and rsplit on the first { and on the last }, which is where the data object is located
    jsonValue = '{%s}' % (stats.split('{', 1)[1].rsplit('}', 1)[0], )
    # Load into json
    data = json.loads(jsonValue)

    # Post the sensor states to Home Assistant
    usage = int(data.get('dumUsage'))
    limit = int(data.get('dumLimit'))
    days_left = int(data.get('dumDaysLeft'))
    if usage:
        usage_pct = usage / limit * 100
    else:
        usage_pct = 0

    # Raw data
    self.set_state(entity_id='sensor.cox_usage',
                   state=usage,
                   attributes={
                       'friendly_name': 'Cox Usage',
                       'unit_of_measurement': 'GB',
                       'icon': 'mdi:chart-line-variant'
                   })
    self.set_state(entity_id='sensor.cox_limit',
                   state=limit,
                   attributes={
                       'friendly_name': 'Cox Limit',
                       'unit_of_measurement': 'GB',
                       'icon': 'mdi:gauge-full'
                   })
    self.set_state(entity_id='sensor.cox_days_left',
                   state=days_left,
                   attributes={
                       'friendly_name': 'Cox Days Left',
                       'unit_of_measurement': 'Days',
                       'icon': 'mdi:calendar-clock'
                   })
    self.set_state(entity_id='sensor.cox_usage_percent',
                   state=round(usage_pct, 2),
                   attributes={
                       'friendly_name': 'Cox Usage Percent',
                       'unit_of_measurement': '%',
                       'icon': 'mdi:percent'
                   })

    # Calculated/formatted data
    self.set_state(entity_id='sensor.cox_utilization',
                   state='{} / {} GB ({}%)'.format(usage, limit, round(usage_pct)),
                   attributes={
                       'friendly_name': 'Cox Utilization',
                       'unit_of_measurement': None,
                       'icon': 'mdi:percent'
                   })

    now = self.datetime()
    days_in_month = monthrange(now.year, now.month)[1]
    days_passed = max(1, days_in_month - days_left)
    average_daily_usage = usage / days_passed
    remaining_data = max(0, limit - usage)
    if days_left == 0:
        remaining_daily_usage = remaining_data
    else:
        if remaining_data != 0:
            remaining_daily_usage = remaining_data / days_left
        else:
            remaining_daily_usage = 0

    self.set_state(entity_id='sensor.cox_average_daily_usage',
                   state=round(average_daily_usage, 2),
                   attributes={
                       'friendly_name': 'Cox Average Daily Usage',
                       'unit_of_measurement': 'GB',
                       'icon': 'mdi:chart-line'
                   })
    self.set_state(entity_id='sensor.cox_remaining_daily_usage',
                   state=round(remaining_daily_usage, 2),
                   attributes={
                       'friendly_name': 'Cox Remaining Daily Usage',
                       'unit_of_measurement': 'GB',
                       'icon': 'mdi:chart-line-stacked'
                   })
def run(self):
    # run whatever needs to be run
    print("Started : Creating directory for download data")
    # Create dir for download
    path = "Data/DECLINED_LOAN_DATA"
    try:
        if not os.path.exists(path):
            os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
    print("Finished : Creating directory for download data")

    EMAIL = self.EMAIL
    PASSWORD = self.PASSWORD

    # constants
    LOGIN_URL = 'https://www.lendingclub.com/account/gotoLogin.action'
    POST_LOGIN_URL = 'https://www.lendingclub.com/info/download-data.action'

    cwd = os.getcwd()
    destDir = os.path.join(cwd, "Data/DECLINED_LOAN_DATA")

    browser = mechanicalsoup.Browser()  # Browser

    # request lending club login page. the result is a requests.Response object
    # http://docs.python-requests.org/en/latest/user/quickstart/#response-content
    login_page = browser.get(LOGIN_URL)
    # similar to assert login_page.ok but with full status code in case of
    # failure.
    login_page.raise_for_status()

    print("Logging in....")
    # login_page.soup is a BeautifulSoup object
    # http://www.crummy.com/software/BeautifulSoup/bs4/doc/#beautifulsoup
    # we grab the login form
    login_form = mechanicalsoup.Form(login_page.soup.select_one('#login form'))
    # specify username and password
    login_form.input({"login_email": EMAIL, "login_password": PASSWORD})
    # submit form
    page2 = browser.submit(login_form, login_page.url)

    # verify we are now logged in
    # assert will see to it that the selected object exists
    # assert page2.soup.select("ul.signed-in")
    print("Successfully logged in to ", page2.soup.title.text, " [", page2.url, "]")

    # verify we remain logged in (thanks to cookies) as we browse the rest of
    # the site
    page3 = browser.get(POST_LOGIN_URL)
    print("Successfully navigated to ", page3.soup.title.text, " [", page3.url, "]")

    print("Started : Downloading declined loan data")
    # scrape
    download_file_string = page3.soup.select("div#rejectedLoanStatsFileNamesJS")[0].text
    download_file_list = download_file_string.split("|")
    initial_path = "https://resources.lendingclub.com/"

    # download
    for sec_filename in download_file_list:
        try:
            if len(sec_filename) > 0:
                theurl = initial_path + sec_filename
                # print(theurl)
                filename = mktemp('.zip')
                name, hdrs = urllib.request.urlretrieve(theurl, filename)
                thefile = ZipFile(filename)
                thefile.extractall(destDir)
                thefile.close()
        except Exception as e:
            print("URL : " + sec_filename + " not found " + str(e))
        time.sleep(1)
    print("Finished : Downloading declined loan data")