def run(self):
    """Drive the VPN login state machine: open the landing page, then
    repeatedly perform whatever action the portal requires next."""
    # Open landing page
    self.r = self.br.open("https://" + self.args.host)
    simple_actions = {
        "tncc": self.action_tncc,
        "login": self.action_login,
        "key": self.action_key,
        "connect": self.action_connect,
    }
    while True:
        action = self.next_action()
        if action == "continue":
            # The Juniper VPN has HTML syntax errors (a stray comma in the
            # checkbox tag) that keep the mechanize parser from parsing the
            # page, so pull the HTML, fix the one critical error, and
            # rebuild the response object before handling the page.
            update_response = self.br.response()
            html = update_response.get_data().replace(
                '<td><input id="postfixSID_1" type="checkbox" onclick="checkSelected()", name="postfixSID"',
                '<td><input id="postfixSID_1" type="checkbox" onclick="checkSelected()" name="postfixSID"',
            )
            headers = re.findall(r"(?P<name>.*?): (?P<value>.*?)\r\n",
                                 str(update_response.info()))
            fixed = mechanize.make_response(
                html, headers, update_response.geturl(),
                update_response.code, update_response.msg)
            self.br.set_response(fixed)
            self.action_continue()
        else:
            handler = simple_actions.get(action)
            if handler is not None:
                handler()
        self.last_action = action
def perform_kindle_login(browser, email, password): """ Log in to Amazon Kindle using the provided email and passwords and exit if login fails """ # Login to Amazon browser.open(AMAZON_LOGIN_URL) bugged_response = browser.response().get_data() doctype_stripped = re.sub('<!DOCTYPE[^>]*>','', bugged_response) incorrect_backslashes_stripped = re.sub('\\\\', '', doctype_stripped) correct_response = mechanize.make_response(incorrect_backslashes_stripped, [("Content-Type", "text/html")], AMAZON_LOGIN_URL, 200, "OK") browser.set_response(correct_response) browser.select_form(name="signIn") browser["email"] = email browser["password"] = password login_response = browser.submit() if login_response.code >= 400: print "Error: Failed to log in to " + AMAZON_LOGIN_URL exit(1) login_html = strip_invalid_html(login_response.get_data()) soup = BeautifulSoup(login_html) # Amazon doesn't use response codes or data to indicate errors... it uses HTML, built on the backend :| if len(soup.select("div.message.error")) != 0: print "Error: Failed to log in to " + AMAZON_LOGIN_URL exit(1)
def action_continue(self): # The Juniper VPN has HTML syntax errors that keep the # mechanize parser from being able to properly parse the html # So we pull the HTML, fix the one critical error, and # recreate the request update_response = self.br.response() html = update_response.get_data().replace( '<td><input id="postfixSID_1" type="checkbox" ' 'onclick="checkSelected()", name="postfixSID"', '<td><input id="postfixSID_1" type="checkbox" ' 'onclick="checkSelected()" name="postfixSID"') headers=re.findall(r"(?P<name>.*?): (?P<value>.*?)\r\n", str(update_response.info())) response = mechanize.make_response(html, headers, update_response.geturl(), update_response.code, update_response.msg) self.r = response self.br.set_response(response) # sometimes only one connection can be active at a time, # force log out other sessions. Find the checkbox, click it # then remove the disable from the submit button self.br.select_form(nr=0) print "Terminating existing session!" check_box_control = self.br.find_control(name='postfixSID') close_selected_session = self.br.find_control(name='btnContinue') # flip the selection on for item in check_box_control.items: item.selected = True # remove disabled from close sessions close_selected_session.disabled = False # now submit correct button self.r = self.br.submit(name='btnContinue')
def perform_kindle_login(browser, email, password): """ Log in to Amazon Kindle using the provided email and passwords and exit if login fails """ # Login to Amazon browser.open(AMAZON_LOGIN_URL) bugged_response = browser.response().get_data() doctype_stripped = re.sub('<!DOCTYPE[^>]*>', '', bugged_response) incorrect_backslashes_stripped = re.sub('\\\\', '', doctype_stripped) correct_response = mechanize.make_response(incorrect_backslashes_stripped, [("Content-Type", "text/html")], AMAZON_LOGIN_URL, 200, "OK") browser.set_response(correct_response) browser.select_form(name="signIn") browser["email"] = email browser["password"] = password login_response = browser.submit() if login_response.code >= 400: print "Error: Failed to log in to " + AMAZON_LOGIN_URL exit(1) login_html = strip_invalid_html(login_response.get_data()) soup = BeautifulSoup(login_html) # Amazon doesn't use response codes or data to indicate errors... it uses HTML, built on the backend :| if len(soup.select("div.message.error")) != 0: print "Error: Failed to log in to " + AMAZON_LOGIN_URL exit(1)
def __init__(self, link):
    """Create a browser session, open *link*, and locate the course form."""
    self.link = link
    # Configure a mechanize browser with a cookie jar and lenient options.
    browser = mechanize.Browser()
    jar = cookielib.LWPCookieJar()
    browser.set_cookiejar(jar)
    browser.set_handle_robots(False)
    browser.set_handle_redirect(True)
    browser.set_handle_refresh(False)
    browser.set_handle_equiv(True)
    browser.set_handle_referer(True)
    browser.addheaders = [('User-agent', 'Firefox')]
    # Open link
    response = browser.open(self.link)
    control = None
    if response.code == 200:
        # Remove self-closing tags that break mechanize's form detection.
        cleaned = (response.get_data()
                   .replace("<p/>", "")
                   .replace("<p />", "")
                   .replace("<hr/>", ""))
        rebuilt = mechanize.make_response(
            cleaned, [("Content-Type", "text/html")], self.link, 200, "OK")
        browser.set_response(rebuilt)
        # Select the first (index zero) form
        browser.select_form(nr=0)
        control = browser.form.find_control("idCursus")
    self.br = browser
    self.control = control
def run(self):
    """Main loop: open the landing page, then perform whatever action the
    portal requires next, forever."""
    self.r = self.br.open('https://' + self.args.host)
    while True:
        action = self.next_action()
        if action == 'continue':
            # Juniper's HTML has a syntax error (a stray comma in the
            # checkbox tag) that breaks mechanize's parser; patch the
            # markup and rebuild the response before continuing.
            resp = self.br.response()
            patched = resp.get_data().replace(
                '<td><input id="postfixSID_1" type="checkbox" onclick="checkSelected()", name="postfixSID"',
                '<td><input id="postfixSID_1" type="checkbox" onclick="checkSelected()" name="postfixSID"'
            )
            hdrs = re.findall(r"(?P<name>.*?): (?P<value>.*?)\r\n",
                              str(resp.info()))
            self.br.set_response(mechanize.make_response(
                patched, hdrs, resp.geturl(), resp.code, resp.msg))
            self.action_continue()
        elif action == 'tncc':
            self.action_tncc()
        elif action == 'login':
            self.action_login()
        elif action == 'key':
            self.action_key()
        elif action == 'connect':
            self.action_connect()
        self.last_action = action
def action_continue(self): # Say what? The Juniper VPN has HTML syntax errors that keep the mechanize # parser from being able to properly parse the html # So we pull the HTML, fix the one critical error, # and recreate the request update_response = self.br.response() html = update_response.get_data().replace('<td><input id="postfixSID_1" type="checkbox" onclick="checkSelected()", name="postfixSID"', '<td><input id="postfixSID_1" type="checkbox" onclick="checkSelected()" name="postfixSID"') headers=re.findall(r"(?P<name>.*?): (?P<value>.*?)\r\n", str(update_response.info())) response = mechanize.make_response(html, headers,update_response.geturl(), update_response.code,update_response.msg) self.r = response self.br.set_response(response) # Yes, I want to terminate the existing connection self.br.select_form(nr=0) print "Terminating existing session!" # sometimes only one connection can be active at a time, # force log out other sessions. Find the checkbox, click it # then remove the disable from the submit button check_box_control = self.br.find_control(name='postfixSID') close_selected_session = self.br.find_control(name='btnContinue') # flip the selection on for item in check_box_control.items: item.selected = True # remove disabled from close sessions (javascript normally does this) close_selected_session.disabled = False # now submit correct button self.r = self.br.submit(name='btnContinue')
def test_new_response(self):
    """A response installed via set_response() becomes the current one."""
    br = mechanize.Browser()
    data = "<html><head><title>Test</title></head><body><p>Hello.</p></body></html>"
    synthetic = mechanize.make_response(
        data, [("Content-type", "text/html")],
        "http://example.com/", 200, "OK")
    br.set_response(synthetic)
    self.assertEqual(br.response().get_data(), data)
def soupify_form(soup, form_id):
    """Re-parse the current page with BeautifulSoup and replace the browser
    response with just the form identified by *form_id*."""
    if not soup:
        soup = BeautifulSoup(br.response().read(), "html.parser")
    form_html = str(soup.find('form', attrs={'id': form_id}))
    br.set_response(mechanize.make_response(
        form_html, [("Content-Type", "text/html")], br.geturl(), 200, "OK"))
def test_set_cookie(self):
    """set_cookie() must fail unless an HTTP/HTTPS page has been visited."""
    class CookieTestBrowser(TestBrowser):
        default_features = list(
            TestBrowser.default_features) + ["_cookies"]

    def make_cookie_browser(url):
        # Build a browser whose mock handler serves a canned page at *url*.
        browser = CookieTestBrowser()
        canned = mechanize.make_response(
            "<html><head><title>Title</title></head><body></body></html>",
            [("content-type", "text/html")],
            url, 200, "OK",
        )
        browser.add_handler(make_mock_handler()([("http_open", canned)]))
        return browser, browser._ua_handlers["_cookies"].cookiejar

    # have to be visiting HTTP/HTTPS URL
    br, cj = make_cookie_browser("ftp://example.com/")
    self.assertRaises(mechanize.BrowserStateError,
                      br.set_cookie, "foo=bar")
    self.assertEqual(len(cj), 0)

    url = "http://example.com/"
    br, cj = make_cookie_browser(url)
    # have to be visiting a URL
    self.assertRaises(mechanize.BrowserStateError,
                      br.set_cookie, "foo=bar")
    self.assertEqual(len(cj), 0)

    # normal case
    br.open(url)
    br.set_cookie("foo=bar")
    self.assertEqual(len(cj), 1)
    self.assertEqual(cj._cookies["example.com"]["/"]["foo"].value, "bar")
def response_scrapy2mechanize(scrapy_response):
    """Convert a Scrapy response into an equivalent mechanize response."""
    code = scrapy_response.status
    return make_response(
        scrapy_response.body,
        headers_scrapy2mechanize(scrapy_response.headers),
        scrapy_response.url.encode('utf8'),
        code,
        responses.get(code, ''),
    )
def test_set_cookie(self):
    """Cookies can only be set while visiting an HTTP/HTTPS URL."""
    class CookieTestBrowser(TestBrowser):
        default_features = list(TestBrowser.default_features)+["_cookies"]

    page = "<html><head><title>Title</title></head><body></body></html>"

    # have to be visiting HTTP/HTTPS URL
    ftp_url = "ftp://example.com/"
    browser = CookieTestBrowser()
    canned = mechanize.make_response(
        page, [("content-type", "text/html")], ftp_url, 200, "OK",
    )
    browser.add_handler(make_mock_handler()([("http_open", canned)]))
    jar = browser._ua_handlers["_cookies"].cookiejar
    self.assertRaises(mechanize.BrowserStateError,
                      browser.set_cookie, "foo=bar")
    self.assertEqual(len(jar), 0)

    http_url = "http://example.com/"
    browser = CookieTestBrowser()
    canned = mechanize.make_response(
        page, [("content-type", "text/html")], http_url, 200, "OK",
    )
    browser.add_handler(make_mock_handler()([("http_open", canned)]))
    jar = browser._ua_handlers["_cookies"].cookiejar

    # have to be visiting a URL
    self.assertRaises(mechanize.BrowserStateError,
                      browser.set_cookie, "foo=bar")
    self.assertEqual(len(jar), 0)

    # normal case
    browser.open(http_url)
    browser.set_cookie("foo=bar")
    self.assertEqual(len(jar), 1)
    self.assertEqual(jar._cookies["example.com"]["/"]["foo"].value, "bar")
def test_new_response(self):
    """A synthetic response installed via set_response() is returned as-is."""
    br = self.make_browser()
    data = ("<html><head><title>Test</title></head>"
            "<body><p>Hello.</p></body></html>")
    fake = mechanize.make_response(data, [("Content-type", "text/html")],
                                   "http://example.com/", 200, "OK")
    br.set_response(fake)
    self.assertEqual(br.response().get_data(), data)
def _read_recording(self):
    """Replay one intercepted response from the recording directory, if any."""
    dump_path = '%s/%d.json' % (self._recording_path, self._intercept_count)
    if not os.path.exists(dump_path):
        return
    with open(dump_path, 'rb') as f:
        data = pickle.load(f)
    # An empty dump means no response was recorded for this intercept.
    if not data:
        self.set_response(None)
        return
    return self.set_response(mechanize.make_response(**data))
def soupify_form(self, soup, form_name):
    '''
    Work around mechanize's 'ParseError: OPTION outside of SELECT' by
    round-tripping the page through BeautifulSoup and swapping in just
    the form named *form_name* as the current response.
    '''
    if not soup:
        soup = BeautifulSoup(self.br.response().read())
    form_markup = str(soup.find('form', attrs={'name': form_name}))
    rebuilt = mechanize.make_response(
        form_markup, [("Content-Type", "text/html")],
        self.br.geturl(), 200, "OK")
    self.br.set_response(rebuilt)
def _make_response_and_submit(self, ctrl_dict: Dict[str, Union[str, List]], html: str) -> bytes:
    """
    Rebuild a browser response from *html*, select the ASP.NET form,
    apply the control values in *ctrl_dict*, and submit it (postback).

    :param ctrl_dict: page control ids mapped to the values to set
    :param html: raw HTML to install as the current response
    :return: bytes of the postback response body
    """
    synthetic = mechanize.make_response(
        html, [('Content-Type', 'text/html')],
        self.browser.geturl(), 200, 'OK')
    self.browser.set_response(synthetic)
    self.browser.select_form('aspnetForm')
    # ASP.NET hidden fields are read-only by default; unlock them first.
    self.browser.form.set_all_readonly(False)
    self._set_controls(ctrl_dict)
    return self.browser.submit().read()
def _writeable(self):
    "Make the form writeable"
    # Turn hidden inputs into text inputs and drop readonly flags so the
    # form machinery will let us set their values.
    for input_id in INPUT_IDS:
        node = self.x.xpath('id("%s")' % input_id)[0]
        if node.attrib['type'] == 'hidden':
            node.attrib['type'] = 'text'
        node.attrib.pop('readonly', None)
    # Persist the modified DOM as the browser's current response.
    rebuilt = make_response(tostring(self.x),
                            [("Content-Type", "text/html")],
                            self.geturl(), 200, "OK")
    self.set_response(rebuilt)
def _cleanup_html(self, response):
    """Capture the raw page, optionally run it through HTML Tidy, and
    return a rebuilt mechanize response holding the cleaned HTML."""
    response.seek(0)
    self._orig_html = response.read()
    self._url = response.geturl()
    response.seek(0)
    self._html = self._orig_html
    # Honour twill's 'use_tidy' option: keep tidy's output when it
    # produced any, otherwise fall back to the original HTML.
    from twill.commands import _options
    use_tidy = _options.get('use_tidy')
    if use_tidy:
        (new_html, errors) = run_tidy(self._html)
        if new_html:
            self._html = new_html
    return mechanize.make_response(self._html,
                                   response._headers.items(),
                                   response._url,
                                   response.code,
                                   response.msg)
def transfer(self, from_id, to_id, amount, reason=None):
    """Move *amount* from account *from_id* to *to_id* and return the
    resulting Transfer object; raises TransferError on failure."""
    if not self.is_on_page(TransferPage):
        self.location('/NS_VIRDF')
    # The bank's HTML contains broken '<!input' tags that must be
    # repaired before the page can be parsed.
    cleaned = self.response().get_data().replace("<!input", "<input")
    self.set_response(mechanize.make_response(
        cleaned,
        [("Content-Type", "text/html")],
        "https://client.hellobank.fr/NS_VIRDF", 200, "OK"))
    accounts = self.page.get_accounts()
    self.page.transfer(from_id, to_id, amount, reason)
    if not self.is_on_page(TransferCompletePage):
        raise TransferError('An error occured during transfer')
    result = Transfer(self.page.get_id())
    result.amount = amount
    result.origin = accounts[from_id].label
    result.recipient = accounts[to_id].label
    result.date = datetime.now()
    return result
def scrape_job_links(self, url):
    """Search the job portal at *url* for 'Coverity' openings and return a
    list of dicts with 'title', 'url' and 'location' keys, walking every
    results page via the site's JavaScript-style pagination postback."""
    jobs = []
    self.br.open(url)
    self.br.follow_link(self.br.find_link(text='Search openings'))
    #
    # self.br.select_form fails with 'ParseError: OPTION outside of SELECT'
    # unless we feed this through BeautifulSoup first!
    #
    soup = soupify(self.br.response().read())
    html = soup.prettify().encode('utf8')
    resp = mechanize.make_response(html, [("Content-Type", "text/html")],
                                   self.br.geturl(), 200, "OK")
    self.br.set_response(resp)
    self.br.select_form('frmAgent')
    self.br.form['keyword'] = 'Coverity'
    self.br.submit('submit2')
    pageno = 2
    while True:
        s = soupify(self.br.response().read())
        # Result rows are identified by their job-detail links; the
        # title and location come from the enclosing table row.
        r = re.compile(r'^cim_jobdetail\.asp\?')
        for a in s.findAll('a', href=r):
            tr = a.findParent('tr')
            td = tr.findAll('td')
            job = {}
            job['title'] = a.text
            job['url'] = self.cleaned_url(a['href'])
            job['location'] = td[3].text
            jobs.append(job)
        a = s.find('a', text='%d' % pageno)
        if not a:
            break  # no more pages
        pageno += 1
        #
        # Page 2 link: <a href="javascript:LoadPage(51)">2</a>
        #
        # function LoadPage(lRecordStart)
        # {
        #     document.frmMassSelect.recordstart.value = lRecordStart;
        #     document.frmMassSelect.JobInfo.value = gstrJobInfo;
        #     document.frmMassSelect.JobSiteInfo.value = gJobSiteInfo;
        #     document.frmMassSelect.sorting.value = "";
        #     document.frmMassSelect.submit();
        # }
        #
        # Clicking on page 2 link Results in FormData:
        #
        # JobInfo:%%
        # recordstart:51
        # sorting:
        # JobSiteInfo:
        #
        # So emulate LoadPage() by posting frmMassSelect with the
        # recordstart value extracted from the link's href.
        r = re.compile(r'LoadPage\((\d+)')
        m = re.search(r, a['href'])
        self.br.select_form('frmMassSelect')
        self.br.set_all_readonly(False)
        self.br.form['recordstart'] = m.group(1)
        self.br.submit()
    return jobs
def scrape_beneficiaries(self, state_item):
    """Scrape grant-beneficiary records from the ASP.NET search site into
    'final_<csvfile>', driving the search form, paging through results via
    simulated __doPostBack requests, and opening each row's detail view.

    NOTE(review): pagination is hard-coded to stop at page 173 and rows
    2..16 per page — see the TODO below.
    """
    self.br.open(self.url)
    s = BeautifulSoup(self.br.response().read())
    # Keep a pristine copy of the form markup; postbacks rebuild it.
    saved_form = s.find('form', id='aspnetForm').prettify()
    self.br.select_form('aspnetForm')
    #ERDF: ctl00$ContentPlaceHolder1$ddlPrograme == 1
    #ERDF: ctl00$ContentPlaceHolder1$ddlPrograme == 2
    self.br.form['ctl00$ContentPlaceHolder1$ddlPrograme'] = [ "1" ]
    self.br.form['ctl00$ContentPlaceHolder1$ddlYearApproved'] = [ "-1" ]
    self.br.form['ctl00$ContentPlaceHolder1$ddlTotalGrantApproved'] = [ "any" ]
    self.br.form['ctl00$ContentPlaceHolder1$ddlyearCompleted'] = [ "-1" ]
    self.br.form['ctl00$ContentPlaceHolder1$ddlTotalamountPaid'] = [ "any" ]
    # Fake the image-button click coordinates the server expects.
    self.br.form.new_control('hidden', 'ctl00$ContentPlaceHolder1$ibtnAdvanceSearch.x', {'value': '58'})
    self.br.form.new_control('hidden', 'ctl00$ContentPlaceHolder1$ibtnAdvanceSearch.y', {'value': '14'})
    self.br.form['ctl00$ContentPlaceHolder1$ddlcountry'] = [ "-1" ]
    viewstate = s.select("#__VIEWSTATE")[0]['value']
    eventvalidation = s.select("#__EVENTVALIDATION")[0]['value']
    self.br.form.set_all_readonly(False)
    self.br.form['__EVENTVALIDATION'] = eventvalidation
    self.br.form['__VIEWSTATE'] = viewstate
    self.br.form['__EVENTARGUMENT'] = 'ctl00$ContentPlaceHolder1$ddlPrograme'
    self.br.form['__VIEWSTATEGENERATOR'] = 'CA0B0334'
    self.br.form.fixup()
    self.br.submit()
    pageno = 2
    with open('final_'+csvfile, 'wb') as fout:
        writer = csv.writer(fout, dialect="mydialect")
        writer.writerow(['Organization', 'Project', 'Year_start', 'Total_ammount', 'Year_end', 'Paid_ammount', 'Project_postcode', 'Priority', 'Total_cost', 'Description'])
        while True:
            resp = BeautifulSoup(self.br.response().read())
            # Dump the raw page for debugging/auditing.
            with open("Output"+str(pageno-1)+".txt", "w") as text_file:
                text_file.write(self.br.response().read())
            # Result rows are numbered ctl02..ctl16 on each page.
            indexNR = 2
            elemNR = 0
            while indexNR != 17:
                if indexNR < 10:
                    counter = '0'+str(indexNR)
                else:
                    counter = str(indexNR)
                orgName = resp.find('div',{'id':'ctl00_ContentPlaceHolder1_gvSearchResult_ctl'+counter+'_pnlOrgName'})
                projectTitle = resp.find('div',{'id':'ctl00_ContentPlaceHolder1_gvSearchResult_ctl'+counter+'_pnlProjectTitle'})
                # Year/amount cells are laid out as siblings of the title cell.
                yearStart = projectTitle.parent.next_sibling
                totalAmmount = yearStart.next_sibling
                yearEnd = totalAmmount.next_sibling
                paidAmmount = resp.find('span',{'id':'ctl00_ContentPlaceHolder1_gvSearchResult_ctl'+counter+'_lblTotalAmountPaid'})
                # Post back 'Select$<row>' to open this row's detail view.
                html = resp.find("form", id='aspnetForm').prettify().encode('utf8')
                view_state = resp.select("#__VIEWSTATE")[0]['value']
                event_val = resp.select("#__EVENTVALIDATION")[0]['value']
                resp0 = mechanize.make_response(html, [("Content-Type", "text/html")], self.br.geturl(), 200, "OK")
                self.br.set_response(resp0)
                self.br.select_form('aspnetForm')
                self.br.form.set_all_readonly(False)
                self.br.form['__EVENTTARGET'] = "ctl00$ContentPlaceHolder1$gvSearchResult"
                self.br.form['__EVENTARGUMENT'] = "Select$"+str(elemNR)
                self.br.form['__VIEWSTATE'] = view_state
                self.br.form['__VIEWSTATEGENERATOR'] = "CA0B0334"
                self.br.form['__VIEWSTATEENCRYPTED'] = ""
                self.br.form['__EVENTVALIDATION'] = event_val
                self.br.form['ctl00$ContentPlaceHolder1$txtQuickSearch'] = ""
                # Remove the other submit buttons so only the grid event fires.
                ctl = self.br.form.find_control('ctl00$ibtnApplicationSearch')
                self.br.form.controls.remove(ctl)
                ctl = self.br.form.find_control('ctl00$ibtnContact')
                self.br.form.controls.remove(ctl)
                ctl = self.br.form.find_control('ctl00$ibtnEUgrants')
                self.br.form.controls.remove(ctl)
                ctl = self.br.form.find_control('ctl00$ContentPlaceHolder1$ibtnQuickSearch')
                self.br.form.controls.remove(ctl)
                ctl = self.br.form.find_control('ctl00$ContentPlaceHolder1$ibtnReturnToAdv')
                self.br.form.controls.remove(ctl)
                self.br.form.fixup()
                self.br.submit()
                # Parse the detail page for the remaining CSV columns.
                resp0_back = BeautifulSoup(self.br.response().read())
                table = resp0_back.find('table')
                table_rows = table.find_all('tr')
                description = table_rows[3].find_all('td')[1].find('div').text.strip().encode("utf-8")
                description = description.replace('\n', ' ')
                description = description.replace('\r', ' ')
                project_postcode = table_rows[4].find_all('td')[1].find('div').text.strip().encode("utf-8")
                priority = ''.join(table_rows[6].find_all('td')[1].find('div').find_all('span')[1].next_sibling).strip()[:2].encode("utf-8")
                total_cost = table_rows[7].find_all('td')[1].find('div').text.strip().encode("utf-8")
                print "-------------------------"
                print indexNR
                print orgName.text.strip()
                print projectTitle.text.strip()
                print yearStart.text.strip()
                print totalAmmount.text.strip()
                print yearEnd.text.strip()
                print paidAmmount.text.strip()
                writer.writerow([orgName.text.strip().encode('ascii', 'ignore'), projectTitle.text.strip().encode('ascii', 'ignore'), yearStart.text.strip().encode('ascii', 'ignore'), totalAmmount.text.strip().encode('ascii', 'ignore'), yearEnd.text.strip().encode('ascii', 'ignore'), paidAmmount.text.strip().encode('ascii', 'ignore'), project_postcode, priority, total_cost, description])
                indexNR += 1
                elemNR += 1
            #TODO: Check for length of list, do no hardcode pagination!!!
            pageno += 1
            if pageno == 174:
                break
            # New __VIEWSTATE value
            view_state = resp.select("#__VIEWSTATE")[0]['value']
            event_val = resp.select("#__EVENTVALIDATION")[0]['value']
            # Regenerate form for next page
            html = resp.find("form", id='aspnetForm').prettify().encode('utf8')
            resp = mechanize.make_response(html, [("Content-Type", "text/html")], self.br.geturl(), 200, "OK")
            indexNR = 2
            self.br.set_response(resp)
            self.br.select_form('aspnetForm')
            self.br.form.set_all_readonly(False)
            self.br.form['__EVENTTARGET'] = "ctl00$ContentPlaceHolder1$gvSearchResult"
            self.br.form['__EVENTARGUMENT'] = "Page$"+str(pageno-1)
            self.br.form['__VIEWSTATE'] = view_state
            self.br.form['__VIEWSTATEGENERATOR'] = "CA0B0334"
            self.br.form['__VIEWSTATEENCRYPTED'] = ""
            self.br.form['__EVENTVALIDATION'] = event_val
            self.br.form['ctl00$ContentPlaceHolder1$txtQuickSearch'] = ""
            # Again strip the competing submit buttons before posting back.
            ctl = self.br.form.find_control('ctl00$ibtnApplicationSearch')
            self.br.form.controls.remove(ctl)
            ctl = self.br.form.find_control('ctl00$ibtnContact')
            self.br.form.controls.remove(ctl)
            ctl = self.br.form.find_control('ctl00$ibtnEUgrants')
            self.br.form.controls.remove(ctl)
            ctl = self.br.form.find_control('ctl00$ContentPlaceHolder1$ibtnQuickSearch')
            self.br.form.controls.remove(ctl)
            ctl = self.br.form.find_control('ctl00$ContentPlaceHolder1$ibtnReturnToAdv')
            self.br.form.controls.remove(ctl)
            self.br.form.fixup()
            self.br.submit()
opening[day + '_closing'] = daily_hours[1] store['opening'] = opening m = re.compile(r'new google.maps.LatLng(.*?);').search(store_html) store['location'] = m.group(1).replace('(','').replace(')','') print store scraperwiki.sqlite.save(unique_keys=["name"], data=store) # And finally - international stores. INTERNATIONAL_URL = 'http://corporate.marksandspencer.com/aboutus/where/international_stores' browser = mechanize.Browser() browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')] browser.open(INTERNATIONAL_URL) #print browser.response().read() html = browser.response().get_data().replace('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">','').replace('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">','<html>') response = mechanize.make_response( html, [("Content-Type", "text/html")], INTERNATIONAL_URL, 200, "OK") browser.set_response(response) #browser.select_form(nr=0)import json import mechanize import re import scraperwiki import urllib2 import lxml.html POSTCODE_URL = 'https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=jsonlist&name=uk_postcode_districts&query=select%20postcode_district%20from%20%60uk_postcode_districts%60' BASE_URL = 'http://www.marksandspencer.com/mn/storeLocator?searchCriteria=findByAddress&jsEnabled=true&town=&postCode=' jsonurl = urllib2.urlopen(POSTCODE_URL ).read() postcodes = json.loads(jsonurl)['data'] postcode_districts = [p[0] for p in postcodes]
def scrape_state_firms(self, state_item):
    """Print name/URL of every firm listed for one state, paging through
    the ASP.NET search results by replaying the saved form with fresh
    __VIEWSTATE / __EVENTTARGET values for each __doPostBack."""
    self.br.open(self.url)
    s = BeautifulSoup(self.br.response().read())
    # Keep a pristine copy of the form; paging re-submits it later.
    saved_form = s.find('form', id='aspnetForm').prettify()
    self.br.select_form('aspnetForm')
    self.br.form['ctl00$ContentPlaceHolder1$drpState'] = [ state_item.name ]
    # Hidden fields the ASP.NET AJAX postback machinery expects.
    self.br.form.new_control('hidden', '__ASYNCPOST', {'value': 'true'})
    self.br.form.new_control('hidden', 'ctl00$ScriptManager1', {'value': 'ctl00$ScriptManager1|ctl00$ContentPlaceHolder1$btnSearch'})
    self.br.form.fixup()
    # Remove the buttons we are not "clicking" so only btnSearch fires.
    ctl = self.br.form.find_control('ctl00$ContentPlaceHolder1$btnfrmSearch')
    self.br.form.controls.remove(ctl)
    ctl = self.br.form.find_control('ctl00$ContentPlaceHolder1$btnAccept')
    self.br.form.controls.remove(ctl)
    ctl = self.br.form.find_control('ctl00$ContentPlaceHolder1$btnSearch')
    ctl.disabled = False
    self.br.submit()
    pageno = 2
    while True:
        # The async response is pipe-delimited key|value pairs; the grid
        # HTML lives under 'ctl00_ContentPlaceHolder1_pnlgrdSearchResult'.
        resp = self.br.response().read()
        it = iter(resp.split('|'))
        kv = dict(zip(it, it))
        s = BeautifulSoup(kv['ctl00_ContentPlaceHolder1_pnlgrdSearchResult'])
        r1 = re.compile(r'^frmFirmDetails\.aspx\?FirmID=([A-Z0-9-]+)$')
        r2 = re.compile(r'hpFirmName$')
        x = {'href': r1, 'id': r2}
        for a in s.findAll('a', attrs=x):
            print 'firm name: ', a.text
            print 'firm url: ', urlparse.urljoin(self.br.geturl(), a['href'])
            print
        # Find next page number link
        a = s.find('a', text='%d' % pageno)
        if not a:
            break
        pageno += 1
        # New __VIEWSTATE value
        view_state = kv['__VIEWSTATE']
        # Extract new __EVENTTARGET value from next page link
        r = re.compile(r"__doPostBack\('([^']+)")
        m = re.search(r, a['href'])
        event_target = m.group(1)
        # Regenerate form for next page
        html = saved_form.encode('utf8')
        resp = mechanize.make_response(html, [("Content-Type", "text/html")], self.br.geturl(), 200, "OK")
        self.br.set_response(resp)
        self.br.select_form('aspnetForm')
        self.br.form.set_all_readonly(False)
        self.br.form['__EVENTTARGET'] = event_target
        self.br.form['__VIEWSTATE'] = view_state
        self.br.form['ctl00$ContentPlaceHolder1$drpState'] = [ state_item.name ]
        self.br.form.new_control('hidden', '__ASYNCPOST', {'value': 'true'})
        self.br.form.new_control('hidden', 'ctl00$ScriptManager1', {'value': 'ctl00$ContentPlaceHolder1$pnlgrdSearchResult|'+event_target})
        self.br.form.fixup()
        # Strip the competing buttons again before replaying the form.
        ctl = self.br.form.find_control('ctl00$ContentPlaceHolder1$btnfrmSearch')
        self.br.form.controls.remove(ctl)
        ctl = self.br.form.find_control('ctl00$ContentPlaceHolder1$btnAccept')
        self.br.form.controls.remove(ctl)
        ctl = self.br.form.find_control('ctl00$ContentPlaceHolder1$btnSearch')
        self.br.form.controls.remove(ctl)
        self.br.submit()
def do_transfer(self, account, to, amount, reason=None):
    """
    Transfer the given amount of money from an account to another, tagging
    the transfer with the given reason.

    :param account: source account (must appear in the bank's source list)
    :param to: target account (must appear in the bank's target list)
    :param amount: amount to transfer (euros, may carry cents)
    :param reason: optional label attached to the transfer
    :return: a Transfer object keyed by the submit timestamp
    :raises TransferError: when any step of the flow fails
    """
    # access the transfer page
    transfer_page_unreachable_message = u'Could not reach the transfer page.'
    self.home()
    if not self.page.is_accounts_list():
        raise TransferError(transfer_page_unreachable_message)
    operations_url = self.page.operations_page_url()
    self.location('https://%s%s' % (self.DOMAIN, operations_url))
    transfer_url = self.page.transfer_page_url()
    abs_transfer_url = 'https://%s%s' % (self.DOMAIN, transfer_url)
    self.location(abs_transfer_url)
    if not self.page.is_transfer_page():
        raise TransferError(transfer_page_unreachable_message)
    source_accounts = self.page.get_transfer_source_accounts()
    target_accounts = self.page.get_transfer_target_accounts()
    # check that the given source account can be used
    if account not in source_accounts.values():
        raise TransferError(
            'You cannot use account %s as a source account.' % account)
    # check that the given target account can be used
    if to not in target_accounts.values():
        raise TransferError(
            'You cannot use account %s as a target account.' % to)
    # separate euros from cents
    amount_euros = int(amount)
    amount_cents = int((amount * 100) - (amount_euros * 100))
    # let's circumvent https://github.com/jjlee/mechanize/issues/closed#issue/17
    # using http://wwwsearch.sourceforge.net/mechanize/faq.html#usage
    adjusted_response = self.response().get_data().replace(
        '<br/>', '<br />')
    response = mechanize.make_response(adjusted_response,
                                       [('Content-Type', 'text/html')],
                                       abs_transfer_url, 200, 'OK')
    self.set_response(response)
    # fill the form
    self.select_form(nr=0)
    self['numCompteEmetteur'] = [
        '%s' % self.dict_find_value(source_accounts, account)
    ]
    self['numCompteBeneficiaire'] = [
        '%s' % self.dict_find_value(target_accounts, to)
    ]
    self['montantPartieEntiere'] = '%s' % amount_euros
    self['montantPartieDecimale'] = '%02d' % amount_cents
    if reason is not None:
        self['libelle'] = reason
    self.submit()
    # look for known errors
    content = unicode(self.response().get_data(), 'utf-8')
    insufficient_amount_message = u'Montant insuffisant.'
    maximum_allowed_balance_message = u'Solde maximum autorisé dépassé.'
    if content.find(insufficient_amount_message) != -1:
        raise TransferError('The amount you tried to transfer is too low.')
    if content.find(maximum_allowed_balance_message) != -1:
        raise TransferError(
            'The maximum allowed balance for the target account has been / would be reached.'
        )
    # look for the known "all right" message
    ready_for_transfer_message = u'Vous allez effectuer un virement'
    # BUGFIX: the original used `if not content.find(msg)`, which tests the
    # *index* returned by str.find: -1 ("not found") is truthy so missing
    # messages were never detected, while a match at offset 0 (falsy)
    # raised spuriously. Test for substring absence instead.
    if ready_for_transfer_message not in content:
        raise TransferError('The expected message "%s" was not found.' %
                            ready_for_transfer_message)
    # submit the last form
    self.select_form(nr=0)
    submit_date = datetime.now()
    self.submit()
    # look for the known "everything went well" message
    content = unicode(self.response().get_data(), 'utf-8')
    transfer_ok_message = u'Vous venez d\'effectuer un virement du compte'
    # Same fix as above: substring test, not index truthiness.
    if transfer_ok_message not in content:
        raise TransferError('The expected message "%s" was not found.' %
                            transfer_ok_message)
    # We now have to return a Transfer object; the final page does not
    # provide any transfer id, so we'll use the submit date.
    transfer = Transfer(submit_date.strftime('%Y%m%d%H%M%S'))
    transfer.amount = amount
    transfer.origin = account
    transfer.recipient = to
    transfer.date = submit_date
    return transfer
def login(name, passw, date): #spawn a browser br = mechanize.Browser() cj = cookielib.LWPCookieJar() br.set_cookiejar(cj) br.open(url) soup = BeautifulSoup(br.response().read(), 'lxml') html = str(soup) #handling the responses resp = mechanize.make_response(html, [("Content-Type", "text/html")], br.geturl(), 200, "OK") br.set_response(resp) br.select_form('cplogin') #print br br.set_handle_equiv(True) br.set_handle_redirect(True) br.set_handle_referer(True) br.set_handle_robots(False) br.form.set_all_readonly(False) #login into the mycampus br.form.new_control('password', 'pass', {'value': passw}) br.form.new_control('text', 'user', {'value': name}) br.form.fixup() resp = br.submit() data = br.response().read() soup = BeautifulSoup(data, 'lxml') title = soup.find('title') if title.text == 'Login Successful': br.open('http://portal.mycampus.ca/cp/home/next') br.open( 'http://portal.mycampus.ca/cp/ip/login?sys=sct&url=https://ssbp.mycampus.ca/prod_uoit/twbkwbis.P_GenMenu?name=bmenu.P_RegMnu2' ) br.open( 'https://ssbp.mycampus.ca/prod_uoit/twbkwbis.P_GenMenu?name=bmenu.P_RegMnu2' ) for link in br.links(): if (link.text == 'Select Term'): print 'found' request = br.click_link(link) response = br.follow_link(link) # submit term br.select_form(nr=0) br.form.new_control('text', 'term_in', {'value': date}) br.form.fixup() resp = br.submit() return br, True else: return br, False
def scrape_state_firms(self, state_item):
    """Scrape all architecture firms listed for one state.

    Submits the ASP.NET search form filtered by state, then walks the
    paginated results, saving each previously-unseen firm as an
    ArchitectureFirm model instance. Pagination is driven by replaying
    the saved search form with a hand-built __doPostBack postback.
    """
    self.br.open(self.url)
    s = soupify(self.br.response().read())
    # Keep a pristine copy of the search form; it is replayed for every
    # pagination postback below.
    saved_form = s.find('form', id='aspnetForm').prettify()
    self.br.select_form('aspnetForm')
    # These hidden fields are normally added by ASP.NET's client-side
    # script, so mechanize does not see them — create them manually.
    self.br.form.new_control('hidden', '__EVENTTARGET', {'value': ''})
    self.br.form.new_control('hidden', '__EVENTARGUMENT', {'value': ''})
    self.br.form.new_control('hidden', '__ASYNCPOST', {'value': 'true'})
    self.br.form.new_control('hidden', 'ctl00$ScriptManager1', {
        'value': 'ctl00$ScriptManager1|ctl00$ContentPlaceHolder1$btnSearch'
    })
    self.br.form.fixup()
    self.br.form['ctl00$ContentPlaceHolder1$drpState'] = [state_item.name]
    # Remove the submit buttons we do NOT want to fire, so mechanize
    # posts only the intended btnSearch.
    ctl = self.br.form.find_control(
        'ctl00$ContentPlaceHolder1$btnfrmSearch')
    self.br.form.controls.remove(ctl)
    ctl = self.br.form.find_control('ctl00$ContentPlaceHolder1$btnAccept')
    self.br.form.controls.remove(ctl)
    ctl = self.br.form.find_control('ctl00$ContentPlaceHolder1$btnSearch')
    ctl.disabled = False
    self.br.submit()
    pageno = 2
    while True:
        r = self.br.response()
        s = BeautifulSoup(r.read())
        # NOTE: `r` is rebound from response to regex here — the order of
        # these statements matters.
        r = re.compile(r'^frmFirmDetails\.aspx\?FirmID=([A-Z0-9-]+)$')
        for a in s.findAll('a', href=r):
            m = re.search(r, a['href'])
            g = m.group(1)
            # Skip firms already stored.
            if ArchitectureFirm.objects.filter(frmid=g).exists():
                continue
            firm = ArchitectureFirm()
            firm.name = a.text
            firm.frmid = m.group(1)
            firm.save()
            print a
        # Look for a link to the next page number; absence means we are
        # on the last page.
        a = s.find('a', text='%d' % pageno)
        if not a:
            break
        pageno += 1
        # The async (pipe-delimited) response carries the new VIEWSTATE;
        # extract it so the replayed form posts a valid state.
        r = re.compile(r'VIEWSTATE\|([^|]+)')
        m = re.search(r, str(s))
        view_state = m.group(1)
        # Extract the __doPostBack event target from the pager link.
        r = re.compile(r"__doPostBack\('([^']+)")
        m = re.search(r, a['href'])
        # Replay the saved search form as a fresh response so mechanize
        # can select and fill it again.
        html = saved_form.encode('utf8')
        resp = mechanize.make_response(html, [("Content-Type", "text/html")],
                                       self.br.geturl(), 200, "OK")
        self.br.set_response(resp)
        self.br.select_form('aspnetForm')
        self.br.form.set_all_readonly(False)
        self.br.form['__EVENTTARGET'] = m.group(1)
        self.br.form['__VIEWSTATE'] = view_state
        self.br.form['ctl00$ContentPlaceHolder1$drpState'] = [
            state_item.name
        ]
        self.br.form.new_control('hidden', '__ASYNCPOST', {'value': 'true'})
        self.br.form.new_control(
            'hidden', 'ctl00$ScriptManager1', {
                'value':
                'ctl00$ContentPlaceHolder1$pnlgrdSearchResult|' + m.group(1)
            })
        self.br.form.fixup()
        # For the pagination postback, remove ALL submit buttons — the
        # postback target alone drives the server-side event.
        ctl = self.br.form.find_control(
            'ctl00$ContentPlaceHolder1$btnfrmSearch')
        self.br.form.controls.remove(ctl)
        ctl = self.br.form.find_control(
            'ctl00$ContentPlaceHolder1$btnAccept')
        self.br.form.controls.remove(ctl)
        ctl = self.br.form.find_control(
            'ctl00$ContentPlaceHolder1$btnSearch')
        self.br.form.controls.remove(ctl)
        self.br.submit()
def scrape(self):
    """Scrape case actions from an ASP.NET court-records site.

    Simulates the page's AJAX flow by hand: posts partial-postback
    requests, splices the pipe-delimited AJAX fragments back into a
    saved copy of the form, and finally writes the assembled result to
    result.html.
    """
    def select_form(form):
        # Predicate for mechanize: match the page's main form by id.
        return form.attrs.get('id', None) == 'form1'

    self.br.open(self.url)
    s = soupify(self.br.response().read())
    # Detach the form from the document; AJAX fragments are spliced into
    # this saved copy as the flow below progresses.
    saved_form = s.find('form', id='form1')
    saved_form.extract()
    #
    # 1 - POST to get select options for <select name=ddlEntidadEspecialidad>
    # 2 - Replace <table class="contenedor"> with table in AJAX response
    # 3 - Select option for <select name=ddlEntidadEspecialidad>
    # 4 - Enter filing number
    # 5 - Click button btnConsultarNum
    # 6 - Replace <div id="divActuaciones"> with div in AJAX response
    # 1
    self.br.select_form(predicate=select_form)
    # Hidden fields normally injected by ASP.NET client script; create
    # them manually so the partial postback is accepted.
    self.br.form.new_control('hidden', '__EVENTTARGET', {'value': ''})
    self.br.form.new_control('hidden', '__EVENTARGUMENT', {'value': ''})
    self.br.form.new_control('hidden', '__ASYNCPOST', {'value': 'true'})
    self.br.form.new_control('hidden', 'managerScript', {'value': ''})
    self.br.form.fixup()
    self.br.form.set_all_readonly(False)
    self.br.form['ddlCiudad'] = ['11001']  # Bogata, D.C.
    self.br.form['managerScript'] = 'upPanelCiudad|ddlCiudad'  # div#id value followed by select control name
    self.br.submit()
    # 2
    d = saved_form.find('div', id='upPanelCiudad')
    r = self.br.response().read()
    # The AJAX payload is pipe-delimited; pair up alternating tokens to
    # get a key->fragment mapping.
    it = iter(r.split('|'))
    kv = dict(zip(it, it))
    new_table = BeautifulSoup(kv['upPanelCiudad'])
    old_table = d.table
    old_table.replace_with(new_table)
    # replace_with seems to break find operations so we reparse the tree
    saved_form = soupify(saved_form.prettify())
    # Replay the updated form as a fresh response so mechanize can fill
    # it again.
    html = saved_form.encode('utf8')
    resp = mechanize.make_response(html, [("Content-Type", "text/html")],
                                   self.br.geturl(), 200, "OK")
    self.br.set_response(resp)
    self.br.select_form(predicate=select_form)
    self.br.form.new_control('hidden', '__ASYNCPOST', {'value': 'true'})
    self.br.form.new_control('hidden', 'managerScript', {'value': ''})
    self.br.form.new_control('hidden', 'txtNumeroProcesoID', {'value': ''})  # why doesn't mechanize pick this up?
    self.br.form.set_all_readonly(False)
    self.br.form['__VIEWSTATE'] = kv['__VIEWSTATE']  # update viewstate
    self.br.form['managerScript'] = 'managerScript|btnConsultarNum'
    # 3
    ctl = self.br.form.find_control('ddlEntidadEspecialidad')
    ctl.get(label='JUZGADOS CIVILES DEL CIRCUITO DE BOGOTA').selected = True
    self.br.form['rblConsulta'] = ['1']

    def select_control(ctl):
        # The filing-number input has no stable name (it appears to be
        # randomized, e.g. tqsurrp4n5dcbu2dtnnzjkur), so match it by its
        # distinctive maxlength attribute instead.
        return ctl.attrs.get('maxlength', None) == '23'  # tqsurrp4n5dcbu2dtnnzjkur

    # 4
    ctl = self.br.form.find_control(predicate=select_control)
    self.br.form[ctl.name] = '11001310300220140043000'
    self.br.form.find_control('tbxNumeroConstruido').disabled = False
    self.br.form['tbxNumeroConstruido'] = self.br.form[ctl.name][0:9]
    self.br.form['txtNumeroProcesoID'] = ctl.name
    # 5
    self.br.submit(name='btnConsultarNum')
    r = self.br.response().read()
    it = iter(r.split('|'))
    kv = dict(zip(it, it))
    # 6
    old_div = saved_form.find('div', id='upPanelActuaciones')
    new_div = soupify(kv['upPanelActuaciones'])
    old_div.replace_with(new_div)
    # Un-hide the results panel before saving the assembled page.
    saved_form.find('div', id='panelActuaciones').attrs['style'] = 'display: block'
    f = open('result.html', 'w')
    f.write(saved_form.prettify().encode('utf8'))
    f.close()
    print 'break...'
def do_transfer(self, account, to, amount, reason=None):
    """
    Transfer the given amount of money from an account to another,
    tagging the transfer with the given reason.

    account -- source account (must be listed on the transfer page)
    to      -- target account (must be listed on the transfer page)
    amount  -- amount to transfer (euros; cents are derived from the
               fractional part)
    reason  -- optional label attached to the transfer

    Returns a Transfer object keyed on the submit date, since the final
    page does not provide any transfer id.
    Raises TransferError on any detected failure.
    """
    # access the transfer page
    transfer_page_unreachable_message = u'Could not reach the transfer page.'
    self.home()
    if not self.page.is_accounts_list():
        raise TransferError(transfer_page_unreachable_message)

    operations_url = self.page.operations_page_url()
    self.location('https://%s%s' % (self.DOMAIN, operations_url))

    transfer_url = self.page.transfer_page_url()
    abs_transfer_url = 'https://%s%s' % (self.DOMAIN, transfer_url)
    self.location(abs_transfer_url)
    if not self.page.is_transfer_page():
        raise TransferError(transfer_page_unreachable_message)

    source_accounts = self.page.get_transfer_source_accounts()
    target_accounts = self.page.get_transfer_target_accounts()

    # check that the given source account can be used
    if account not in source_accounts.values():
        raise TransferError('You cannot use account %s as a source account.'
                            % account)

    # check that the given target account can be used
    if to not in target_accounts.values():
        raise TransferError('You cannot use account %s as a target account.'
                            % to)

    # separate euros from cents
    amount_euros = int(amount)
    amount_cents = int((amount * 100) - (amount_euros * 100))

    # let's circumvent https://github.com/jjlee/mechanize/issues/closed#issue/17
    # using http://wwwsearch.sourceforge.net/mechanize/faq.html#usage
    adjusted_response = self.response().get_data().replace('<br/>', '<br />')
    response = mechanize.make_response(adjusted_response,
                                       [('Content-Type', 'text/html')],
                                       abs_transfer_url, 200, 'OK')
    self.set_response(response)

    # fill the form
    self.select_form(nr=0)
    self['numCompteEmetteur'] = ['%s' % self.dict_find_value(source_accounts, account)]
    self['numCompteBeneficiaire'] = ['%s' % self.dict_find_value(target_accounts, to)]
    self['montantPartieEntiere'] = '%s' % amount_euros
    self['montantPartieDecimale'] = '%02d' % amount_cents
    if reason is not None:
        self['libelle'] = reason
    self.submit()

    # look for known errors
    content = unicode(self.response().get_data(), 'utf-8')
    insufficient_amount_message = u'Montant insuffisant.'
    maximum_allowed_balance_message = u'Solde maximum autorisé dépassé.'
    if insufficient_amount_message in content:
        raise TransferError('The amount you tried to transfer is too low.')
    if maximum_allowed_balance_message in content:
        raise TransferError('The maximum allowed balance for the target account has been / would be reached.')

    # look for the known "all right" message
    # BUG FIX: the original used `if not content.find(msg)`, which is True
    # only when the message sits at index 0 and False when str.find()
    # returns -1 (not found) — so the sanity check could never fire when
    # the message was actually missing. Use substring membership instead.
    ready_for_transfer_message = u'Vous allez effectuer un virement'
    if ready_for_transfer_message not in content:
        raise TransferError('The expected message "%s" was not found.'
                            % ready_for_transfer_message)

    # submit the last (confirmation) form
    self.select_form(nr=0)
    submit_date = datetime.now()
    self.submit()

    # look for the known "everything went well" message
    # (same str.find() truthiness bug fixed here as well)
    content = unicode(self.response().get_data(), 'utf-8')
    transfer_ok_message = u'Vous venez d\'effectuer un virement du compte'
    if transfer_ok_message not in content:
        raise TransferError('The expected message "%s" was not found.'
                            % transfer_ok_message)

    # We now have to return a Transfer object; the final page does not
    # provide any transfer id, so we'll use the submit date.
    transfer = Transfer(submit_date.strftime('%Y%m%d%H%M%S'))
    transfer.amount = amount
    transfer.origin = account
    transfer.recipient = to
    transfer.date = submit_date
    return transfer