def slurp_with_login_and_pwd():
    import sys
    import mechanize
    # sys.path.append('ClientCookie-1.0.3')
    # from mechanize import ClientCookie
    # sys.path.append('ClientForm-0.1.17')
    # import ClientForm

    # Create special URL opener (for User-Agent) and cookieJar
    cookieJar = mechanize.CookieJar()

    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookieJar))
    opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible)")]
    mechanize.install_opener(opener)
    fp = mechanize.urlopen("http://login.yahoo.com")
    forms = mechanize.ParseResponse(fp)
    fp.close()

    # print forms on this page
    for form in forms:
        print "***************************"
        print form

    form = forms[0]
    form["login"] = "******"  # use your userid
    form["passwd"] = "password"  # use your password
    fp = mechanize.urlopen(form.click())
    fp.close()

    fp = mechanize.urlopen(
        "https://class.coursera.org/ml-003/lecture/download.mp4?lecture_id=1"
    )  # use your group
    fp.readlines()
    fp.close()
def main(self):
    keyword = remove_polish(word)
    openable = 1
    response = urlopen(uri)
    forms = ParseResponse(response, backwards_compat=False)
    if len(forms) == 0:
        os.system("python PyCrawler.py" + " baza.db '" + uri + "' 1 " + keyword)
        return
    form = forms[0]
    # search for text input in form and put keyword there
    control = form.find_control(type="text")
    control.value = keyword
    # form.click() returns a mechanize.Request object
    # (see HTMLForm.click.__doc__ if you want to use only the forms support, and
    # not the rest of mechanize)
    request2 = form.click()  # mechanize.Request object
    try:
        response2 = urlopen(request2)
    except:
        print "Could not open the form"
        openable = 0
        pass
    # get the url of page
    if not openable:
        search_url = uri
    else:
        search_url = response2.geturl()
    # start crawler on it
    os.system("python PyCrawler.py" + " baza.db '" + search_url + "' 1 " + keyword)
def get_vorlage(session_id, url):
    try:
        response = mechanize.urlopen(mechanize.Request(url))
        pprint.pprint(response)
    except URLError:
        return
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    for form in forms:
        # All forms are iterated. Might not all be attachment-related.
        for control in form.controls:
            if control.name == 'DT':
                print control.name, control.value
                request2 = form.click()
                try:
                    response2 = mechanize.urlopen(request2)
                    form_url = response2.geturl()
                    if "getfile.asp" in form_url:
                        #print "SUCCESS:", response2.info()
                        pdf = response2.read()
                        md5 = hashlib.md5(pdf).hexdigest()
                        scraperwiki.sqlite.save(
                            unique_keys=['session_id', 'dt', 'md5', 'size'],
                            data={
                                'session_id': session_id,
                                'dt': control.value,
                                'md5': md5,
                                'size': len(pdf)
                            })
                        continue
                except mechanize.HTTPError, response2:
                    print "HTTP ERROR :("
                except URLError:
                    pass
def getDLurl(self, url):
    try:
        content = self.getUrl(url)
        match = re.findall('flashvars.playlist = \'(.*?)\';', content)
        if match:
            for url in match:
                url = 'http://ua.canna.to/canna/' + url
                content = self.getUrl(url)
                match = re.findall('<location>(.*?)</location>', content)
                if match:
                    for url in match:
                        url = 'http://ua.canna.to/canna/' + url
                        req = mechanize.Request('http://ua.canna.to/canna/single.php')
                        response = mechanize.urlopen(req)
                        req = mechanize.Request(url)
                        req.add_header('User-Agent', ' Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3')
                        response = mechanize.urlopen(req)
                        response.close()
                        code = response.info().getheader('Content-Location')
                        url = 'http://ua.canna.to/canna/avzt/' + code
                        print url
                        return url
    except urllib2.HTTPError, error:
        printl(error, self, "E")
        message = self.session.open(MessageBox, ("Error: %s" % error), MessageBox.TYPE_INFO, timeout=3)
        return False
def rtnHTMLformat(tmpddGenrcgenPresent, sppPrefx, pthwcod, ouPthwpng):
    inpx = '\n'.join(tmpddGenrcgenPresent)  # inpx="ALDH2 color \nALDH3A1 color"
    request = mechanize.Request(
        "http://www.genome.jp/kegg/tool/map_pathway2.html")
    response = mechanize.urlopen(request)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[0]
    form["unclassified"] = inpx
    form["org"] = sppPrefx
    request2 = form.click()
    response2 = mechanize.urlopen(request2)
    a = str(response2.read()).split('href="/kegg-bin/show_pathway?')[1]
    code = a.split('/')[0]  # response2.read()
    request = mechanize.Request(
        "http://www.genome.jp/kegg-bin/show_pathway?%s/%s.args" % (code, pthwcod))
    # request=mechanize.Request("http://www.genome.jp/kegg-bin/show_pathway?%s/%s.args"%('13171478854246','hsa00410'))
    response = mechanize.urlopen(request)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[1]
    status = ' NOT '
    try:
        imgf = str(forms[1]).split('/mark_pathway')[1].split('/')[0]
        os.system("wget --quiet http://www.genome.jp/tmp/mark_pathway%s/%s.png -O %s"
                  % (imgf, pthwcod, ouPthwpng))
        status = ' '
    except:
        pass
    return 'A pathway image was%ssuccessfully produced...' % status
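# A minimal sketch (not from the project above) of fetching the marked pathway
# PNG with mechanize.urlopen() instead of shelling out to wget. It assumes the
# same http://www.genome.jp/tmp/mark_pathway.../<code>.png URL layout used in
# the snippet above; the helper name is hypothetical.
import mechanize

def fetch_pathway_png(imgf, pthwcod, ouPthwpng):
    png_url = "http://www.genome.jp/tmp/mark_pathway%s/%s.png" % (imgf, pthwcod)
    response = mechanize.urlopen(png_url)
    try:
        # write the raw image bytes to the requested output path
        with open(ouPthwpng, "wb") as handle:
            handle.write(response.read())
    finally:
        response.close()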
def getDLurl(self, url):
    try:
        content = self.getUrl(url)
        match = re.findall('flashvars.playlist = \'(.*?)\';', content)
        if match:
            for url in match:
                url = 'http://ua.canna.to/canna/' + url
                content = self.getUrl(url)
                match = re.findall('<location>(.*?)</location>', content)
                if match:
                    for url in match:
                        req = mechanize.Request('http://ua.canna.to/canna/single.php')
                        response = mechanize.urlopen(req)
                        url = 'http://ua.canna.to/canna/' + url
                        req = mechanize.Request(url)
                        req.add_header('User-Agent', canna_agent)
                        response = mechanize.urlopen(req)
                        response.close()
                        code = response.info().getheader('Content-Location')
                        url = 'http://ua.canna.to/canna/avzt/' + code
                        return url
    except urllib2.HTTPError, error:
        printl(error, self, "E")
        message = self.session.open(MessageBoxExt, (_("Error: %s") % error), MessageBoxExt.TYPE_INFO, timeout=3)
        return False
def reply_to(self, postid, message):
    params = urllib.urlencode({
        'post_parent_id': postid,
        'snId': 0,
        'post_text': message
    })
    mechanize.urlopen(self.base_url + "/actions/post.php", params)
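# Aside (not part of the snippet above): passing a second argument to
# mechanize.urlopen() sends the request as a POST with that string as the
# urlencoded body, just as with urllib2.urlopen(). A minimal standalone
# sketch; the URL and form fields here are hypothetical.
import urllib
import mechanize

body = urllib.urlencode({'q': 'example', 'page': 1})
response = mechanize.urlopen("http://example.com/search", body)  # POST request
print response.geturl()
response.close()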
def connect(url, username, password):
    try:
        if connected(url):
            raise LinkException('You are already connected')
        try:
            response = urlopen(url)
        except URLError:
            raise SSIDException('You are not connected on a FON box')

        forms = ParseResponse(response, backwards_compat=False)
        try:
            form = forms[0]
            form["login[user]"] = username
            form["login[pass]"] = password
        except IndexError:
            raise SSIDException('You are not connected on a FON box')

        try:
            response_page = urlopen(form.click()).read()
        except NameError:
            raise SSIDException('You are not connected on a FON box')

        return not 'class="form_error"' in response_page
    except PlainURLError:
        if connected(url):
            return True
        else:
            raise RuntimeError("Connection failed.")
def _get_results(form, dbg=False):
    # click the form
    clicked_form = form.click()
    # then get the results page
    result = mechanize.urlopen(clicked_form)

    #### EXPORTING RESULTS FILE
    # so what I do is that I fetch the first results page,
    # click the form/link to get all hits as a colon separated
    # ascii table file

    # get the form
    resultform = mechanize.ParseResponse(result, backwards_compat=False)
    result.close()
    resultform = resultform[0]
    # set colon as delimiter of the table (could use anything I guess)
    #~ resultform.find_control('export_delimiter').items[1].selected = True
    resultform.find_control('export_delimiter').toggle('colon')

    resultform_clicked = resultform.click()
    result_table = mechanize.urlopen(resultform_clicked)
    data = result_table.read()
    result_table.close()
    if dbg:
        return resultform, result_table, data
    else:
        return data
def login(self, username, password):
    response = urlopen(urljoin(self.uri, "/?cs=login"))
    forms = ParseResponse(response, backwards_compat=False)
    form = forms[0]
    form.set_value(username, name='username')
    form.set_value(password, name='password')
    self.page = urlopen(form.click())
def loginRedmine():
    forms = ParseResponse(urlopen(URL_LOGIN))
    form = forms[0]
    form['username'] = USER_ID
    form['password'] = USER_PW
    request = form.click()
    mechanize.urlopen(request)
def connect(url, username, password):
    try:
        if connected(url):
            raise LinkException('You are already connected')
        try:
            response = urlopen(url)
        except URLError:
            raise SSIDException('You are not connected on a FON box')

        forms = ParseResponse(response, backwards_compat=False)
        try:
            form = forms[0]
            form["login[USERNAME]"] = username
            form["login[PASSWORD]"] = password
        except IndexError:
            raise SSIDException('You are not connected on a FON box')

        try:
            response_page = urlopen(form.click()).read()
        except NameError:
            raise SSIDException('You are not connected on a FON box')

        return not 'class="form_error"' in response_page
    except PlainURLError:
        if connected(url):
            return True
        else:
            raise RuntimeError("Connection failed.")
def test_sending_headers(self):
    handler = self._make_request_handler([(200, [], "we don't care")])
    req = mechanize.Request("http://localhost:%s/" % handler.port,
                            headers={'Range': 'bytes=20-39'})
    mechanize.urlopen(req)
    self.assertEqual(handler.received_headers['Range'], 'bytes=20-39')
def __get_csv(self, letter='a', now=False):
    # open the url
    current_url = self.overview_url + '1111&b=' + letter
    overview_req = mechanize.Request(current_url)
    overview_res = mechanize.urlopen(overview_req)

    # find the list of entries to post
    py_query = PyQuery(overview_res.read())
    titlelist = py_query("input[name='titelnrliste']").val()

    # create the post request
    post_data = {
        'url': current_url,
        'download': '[Download]',
        'titelnrliste': titlelist
    }

    if (now):
        # find the checked box (the current quartal)
        default_quartal = py_query(".quartal input:checked").attr('name')
        post_data[str(default_quartal)] = 'ON'
    else:
        # enable all quartal's checkboxes
        quartals = [1, 2, 3, 4]
        for i in quartals:
            if i in range(1, 5):
                post_data[str(self.year) + str(i)] = 'ON'

    # send the post request
    csv_req = mechanize.Request(current_url, urllib.urlencode(post_data))
    csv_res = mechanize.urlopen(csv_req)
    self.csv_parser.process_result(response=csv_res)
def login1(self):
    self.brow = mechanize.Browser()
    httpHandler = mechanize.HTTPHandler()
    httpsHandler = mechanize.HTTPSHandler()
    httpHandler.set_http_debuglevel(DEBUG)
    self.cookiejar = mechanize.LWPCookieJar()
    #self.cookiejar = "Cookie lzstat_uv=34741959842666604402|1786789; Hm_lvt_976797cb85805d626fc5642aa5244ba0=1304534271541; ASPSESSIONIDQCDRAQBB=JHCHINLAHGMAIGBIFMNANLGF; lzstat_ss=2189193215_2_1304564199_1786789; Hm_lpvt_976797cb85805d626fc5642aa5244ba0=1304535401191"
    self.opener = mechanize.OpenerFactory(mechanize.SeekableResponseOpener).build_opener(
        httpHandler, httpsHandler,
        mechanize.HTTPCookieProcessor(self.cookiejar),
        mechanize.HTTPRefererProcessor,
        mechanize.HTTPEquivProcessor,
        mechanize.HTTPRefreshProcessor,
    )
    self.opener.addheaders = [
        ("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13"),
        ("From", "")
    ]
    #self.opener.addheaders = [(
    #    "Referer", self.data['postUrl']
    #    )]

    login = {}
    login['method'] = self.data['method']
    login['name'] = self.data['name']
    login['pwd'] = self.data['pwd']
    loginUrl = self.data['loginUrl'] + '?' + urllib.urlencode(login)
    print loginUrl
    response = mechanize.urlopen("http://esf.soufun.com/")
    response = mechanize.urlopen(loginUrl)
    print response.read().decode('gb2312')
def login(conf):
    try:
        username = conf.username
        password = conf.password

        request = mechanize.Request('%s/login.php' % conf.website)
        response = mechanize.urlopen(request, timeout=conf.timeout)
        forms = mechanize.ParseResponse(response)
        response.close()
        if len(forms) < 3:
            return Left('Failed to reach the login page.')
        form = forms[2]
        form['username'] = username
        form['password'] = password

        login_request = form.click()
        login_response = mechanize.urlopen(login_request, timeout=conf.timeout)
        logged_in = login_response.geturl() == ('%s/index.php' % conf.website)
        if not logged_in:
            return Left('Failed to log in with these credentials')
    except mechanize.HTTPError as resp:
        return Left('HTTPError when logging in: %s' % resp)
    except Exception as e:
        return Left('%s' % e)

    if conf.verbose:
        sys.stdout.write('Logged in as %s\n' % username)
    return Right('Logged in as %s' % username)
def getPropertyPins(streetName):
    url = r'https://taxcommissioner.dekalbcountyga.gov/TaxCommissioner/TCSearch.asp'
    request = mechanize.Request(url)
    response = mechanize.urlopen(request)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    response.close()
    form = forms[0]
    form['StreetName'] = streetName
    propertyList = mechanize.urlopen(form.click()).read()

    tree = html.fromstring(propertyList)
    pins = tree.xpath('//tr/td[1]/a/@href')
    addresses = tree.xpath('//tr/td[1]/a/text()')

    pinList = []
    i = 0
    for pin in pins:
        #print pin
        newpin = pin.split('=')
        pinList.append([newpin[3], addresses[i]])
        print newpin[3] + '\t' + addresses[i]
        i = i + 1
    return pinList
def generate_script_sieve(request, group_id):
    group = get_object_or_404(Group, id=group_id)

    if request.method == 'POST':
        if group.always_data_id:
            # There is an always_data mail id
            from mechanize import ParseResponse, urlopen, urljoin

            response = urlopen("https://admin.alwaysdata.com/login/")
            forms = ParseResponse(response, backwards_compat=False)
            login_form = forms[0]
            if settings.DEBUG:
                print login_form
            login_form["email"] = settings.ALWAYS_DATA_ID
            login_form["password"] = settings.ALWAYS_DATA_PASSWORD
            response = urlopen(login_form.click())

            url = 'https://admin.alwaysdata.com/email/%d/' % group.always_data_id
            response = urlopen(url)
            forms = ParseResponse(response, backwards_compat=False)
            if settings.DEBUG:
                for form in forms:
                    print form
            try:
                email_form = forms[1]
            except IndexError:
                messages.warning(request, _(u'%(group)s is not bind to alwaysdata yet (wrong password)' % {'group': group}))
                return HttpResponseRedirect(reverse("group-detail", args=[group.pk]))

            email_form['sieve_filter'] = request.POST['filter_sieve'].encode('utf-8')
            req = email_form.click()
            req.add_header("Referer", url)
            response = urlopen(req)
            messages.success(request, _(u'Alwaysdata has been updated'))
        else:
            messages.warning(request, _(u'%(group)s is not bind to alwaysdata yet' % {'group': group}))
        return HttpResponseRedirect(reverse("group-detail", args=[group.pk]))
    else:
        filter_sieve = export_sieve_configuration(group.contacts.all())
        context = get_global_context_data(Group, Group._meta.app_label)
        context['object_list'] = Group.objects.all()
        context['object'] = group
        context['filter_sieve'] = filter_sieve
        return render_to_response('contact/contact-sieve.html',
                                  context,
                                  context_instance=RequestContext(request))
def test_404(self):
    expected_response = 'Bad bad bad...'
    handler = self._make_request_handler([(404, [], expected_response)])
    try:
        mechanize.urlopen('http://localhost:%s/weeble' % handler.port)
    except mechanize.URLError, f:
        pass
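# Aside (not from the test above): mechanize.HTTPError is a subclass of
# mechanize.URLError, and the exception object doubles as a response, so the
# status code and body can be read from it. A minimal sketch with a
# hypothetical URL:
import mechanize

try:
    mechanize.urlopen("http://example.com/missing-page")
except mechanize.HTTPError, e:
    print e.code         # e.g. 404
    print e.read()[:80]  # first bytes of the error page
except mechanize.URLError, e:
    print "could not connect:", e.reason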
def get_nyc_legislation():
    #search_terms=''
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('Legi', cell_overwrite_ok=True)
    row = -1

    for items in ['smoking']:
        url = r'http://legistar.council.nyc.gov/Legislation.aspx'
        request = mechanize.Request(url)
        response = mechanize.urlopen(request)
        forms = mechanize.ParseResponse(response, backwards_compat=False)
        form = forms[0]
        response.close()
        form['ctl00$ContentPlaceHolder1$txtSearch'] = items
        submit_page = mechanize.urlopen(form.click())
        soup = BeautifulSoup(submit_page.read())

        for link in soup.find_all("a"):
            legislation = link.get("href")
            try:
                if 'LegislationDetail' in legislation:
                    url_stem = 'http://legistar.council.nyc.gov/'
                    url2 = "%s%s" % (url_stem, legislation)
                    request2 = requests.get(url2)
                    soup2 = BeautifulSoup(request2.content)
                    type = soup2.find_all("span", {"id": "ctl00_ContentPlaceHolder1_lblType2"})
                    status = soup2.find_all("span", {"id": "ctl00_ContentPlaceHolder1_lblStatus2"})
                    print url2
                    if ((type[0].text == "Resolution" or type[0].text == "Introduction")
                            and (status[0].text == "Adopted")):
                        legislation_title = soup2.find_all("span", {"id": "ctl00_ContentPlaceHolder1_lblName2"})
                        legislation_date = soup2.find_all("span", {"id": "ctl00_ContentPlaceHolder1_lblOnAgenda2"})
                        legislation_committee = soup2.find_all("a", {"id": "ctl00_ContentPlaceHolder1_hypInControlOf2"})
                        legislation_text = soup2.find_all("span", {"class": "st1"})

                        legi_url, title, date, committee, text = ([] for i in range(5))
                        row = row + 1
                        legi_url = url2
                        for item in legislation_title:
                            title.append(item.text)
                        for item in legislation_date:
                            date.append(item.text)
                        for item in legislation_committee:
                            committee.append(item.text)
                        for item in legislation_text:
                            text.append(' ' + item.text)

                        legi = [legi_url, title, date, committee, text]
                        for column, var_observ in enumerate(legi):
                            sheet.write(row, column, var_observ)
            except:
                pass

    book.save("legislation_data.xls")
def install_phpBB():
    print "installing phpBB..."
    # create forum
    url = "http://" + ip + "/phpBB3/"
    install_url = url + "install/index.php?mode=install&sub="
    br = mechanize.Browser()
    post = "dbms=mysqli&dbhost=" + ip + "&dbport=" + port + "&dbname=cryptdb_phpbb&dbuser=root&dbpasswd=letmein&table_prefix=phpbb_&admin_name=admin&admin_pass1=letmein&admin_pass2=letmein&[email protected]&[email protected]"
    config = mechanize.urlopen(install_url + "config_file", data=post)
    br.set_response(config)
    post += "&email_enable=1&smtp_delivery=0&smtp_host=&smtp_auth=PLAIN&smtp_user=&smtp_pass=&cookie_secure=0&force_server_vars=0&server_protocol=http://&server_name=18.26.5.16&server_port=80&script_path=/phpBB"
    advanced = mechanize.urlopen(install_url + "advanced", data=post)
    br.set_response(advanced)
    br.select_form(nr=0)
    br.submit()
    br.select_form(nr=0)
    br.submit()
    os.system("mv $EDBDIR/../apps/phpBB3/install $EDBDIR/../apps/phpBB3/install2")

    print "logging in..."
    # login
    br.open(url + "ucp.php?mode=login")
    br.select_form(nr=1)
    br["username"] = "******"
    br["password"] = "******"
    br.submit()

    print "to ACP..."
    # authenticate to go to ACP
    br.follow_link(text="Administration Control Panel")
    br.select_form(nr=1)
    i = str(br.form).find("password")
    j = str(br.form).find("=)", i)
    br[str(br.form)[i:j]] = "letmein"
    br.submit()

    print "getting permissions page..."
    # navigate to group permissions
    br.follow_link(text="Permissions")
    br.follow_link(text="Groups\xe2\x80\x99 permissions")
    # select Newly Registered Users
    br.select_form(nr=0)
    br["group_id[]"] = ["7"]
    br.submit()

    # set all permissions to yes
    print "setting permissions..."
    br.select_form(nr=1)
    i = 1
    while i > 0:
        start = str(br.form).find("setting[7][0][", i)
        if (start < 0):
            break
        end = str(br.form).find("=[", start)
        if (end < 0):
            break
        br[str(br.form)[start:end]] = ["1"]
        i = end
    br.submit()
def doLogin():
    url = "https://awesome-hd.net/login.php"
    response = mechanize.urlopen(url)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[0]
    form["username"] = s.username
    form["password"] = s.password
    mechanize.urlopen(form.click())
def isThereInternet(self):
    try:
        mechanize.urlopen('http://google.com', timeout=1)
    except mechanize.URLError as e:
        print "There is no internet: {}".format(e)
        self.error_msg.setText("You are not connected to the internet")
        self.error_msg.setDetailedText("This feature will not work without an internet connection. ")
        self.error_msg.exec_()
        return False
    else:
        return True
def isThereInternet(self):
    try:
        mechanize.urlopen('http://github.com', timeout=1)
        return True
    except mechanize.URLError as e:
        print "There is no internet {}".format(e)
        self.error_msg.setText("You are not connected to the internet")
        self.error_msg.setDetailedText("This feature will not work without an internet connection. ")
        self.error_msg.exec_()
        return False
def isThereInternet(self):
    try:
        mechanize.urlopen('http://github.com', timeout=1)
        return True
    except mechanize.URLError as e:
        print "There is no internet {}".format(e)
        self.error_msg.setText("No internet")
        self.error_msg.setDetailedText(
            "If this keeps happening, it means easy repo stumbled upon a "
            "forbidden link. You might need to change your search string")
        self.error_msg.exec_()
        return False
def randWord(wordType, wordComplexity):
    randURL = 'http://watchout4snakes.com'
    response = urlopen(urljoin(randURL, '/wo4snakes/Random/RandomWordPlus'))
    forms = ParseResponse(response, backwards_compat=False)
    form = forms[0]
    form['Pos'] = [wordType]
    form['Level'] = [wordComplexity]
    return urlopen(form.click()).read()
def get_bclerk_results_text(case):
    print('get_bclerk_results_text(' + case + ')')
    uri = 'http://web1.brevardclerk.us/oncoreweb/search.aspx'
    response = urlopen(uri, timeout=5)
    forms = ParseResponse(response, backwards_compat=False)
    form = forms[0]
    # print form
    form["txtCaseNumber"] = case  # "orozco"
    form["SearchType"] = 'casenumber'  # "orozco"
    form["txtDocTypes"] = ''  # 'JRP, J'  # "orozco"
    # form["txtName"] = "orozco"
    # time.sleep(1)
    bclerk_results_text = urlopen(form.click()).read()
    return bclerk_results_text
def get_rows_from_graframe_url(url3, radius):
    # print('get_rows_from_graframe_url(%s, %s)' % (url3, radius))
    r3 = urlopen(url3)
    forms = ParseResponse(r3, backwards_compat=False)
    # for f in forms:
    #     print f
    form = forms[0]
    # print(form)
    form["select"] = ['Sales']
    form["radius"] = [radius]
    # print(form)
    rows = get_nearby_from_input(urlopen(form.click()))
    # pprint.pprint(rows)
    return rows
def grab_redirect(link):
    response = mechanize.urlopen(link['href'])
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[0]
    data = mechanize.urlopen(form.click()).read()
    soup = BeautifulSoup.BeautifulSoup(data)
    for div in soup('div'):
        if 'class' in dict(div.attrs) and \
           div['class'] == 'urlworkaround':
            txt = ''.join([str(x) for x in div.contents])
            lsoup = BeautifulSoup.BeautifulSoup(txt)
            link = lsoup('a')[0]
            return link['href']
    raise Exception('no href')
def getStartPageAndInputPassword():
    my_browser = mechanize.Browser(factory=mechanize.RobustFactory())
    my_browser.set_handle_robots(False)
    request = mechanize.Request("http://www.bokat.se/protected/groupInfo.jsp")
    response = mechanize.urlopen(request)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    loginForm = forms[0]
    loginForm["j_username"] = "******"
    loginForm["j_password"] = "******"
    try:
        response = mechanize.urlopen(loginForm.click())
    except HTTPError, e:
        sys.exit("post failed: %d: %s" % (e.code, e.msg))
def doSearch(movieTitle, movieYear):
    # Convert non-unicode characters
    movieTitle = removeNonUnicodeChars(movieTitle)

    url = "https://awesome-hd.net/torrents.php"
    response = mechanize.urlopen(url)

    # Search
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[0]
    form["searchstr"] = movieTitle
    html = mechanize.urlopen(form.click()).readlines()
    return html
def start_cloning(options):
    link = options['link']
    user = options['user']
    password = options['password']

    response = mechanize.urlopen(link)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[0]
    form['txtIdentifiant'] = user
    form['txtMDP'] = password
    website = mechanize.urlopen(form.click())
    data = website.read()

    outfile = open('index.html', 'wt')
    print >> outfile, """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html dir="ltr" lang="fr" xml:lang="fr" xmlns="http://www.w3.org/1999/xhtml" class="yui3-js-enabled" id="yui_3_2_0_1_1326674808791714">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
"""

    soup = BeautifulSoup.BeautifulSoup(data)
    title = soup('title')
    print >> outfile, str(title[0])

    divs = soup('div')
    for div in divs:
        if 'class' in dict(div.attrs):
            if div['class'] == 'course-content':
                vstr = '\n'.join([str(x) for x in div.contents[1:]])
                # Eliminate wrong divs
                lsoup = BeautifulSoup.BeautifulSoup(vstr)
                for ldiv in lsoup.findAll('div'):
                    if ('class' in dict(ldiv.attrs) and
                            ldiv['class'] in ['left side', 'right side', 'jumpmenu']):
                        ldiv.extract()
                replace = {}
                for link in lsoup.findAll('a'):
                    if 'href' in dict(link.attrs):
                        try:
                            replace[link['href']] = grab_redirect(link)
                        except:
                            pass
                page_txt = str(lsoup)
                for k, v in replace.items():
                    nw_key = str(k) + "&redirect=1"
                    page_txt = page_txt.replace(nw_key, str(v))
                    page_txt = page_txt.replace(str(k), str(v))
                print >> outfile, page_txt

    outfile.close()
def get_attachment_file(self, attachment, form):
    """
    Loads the attachment file from the server and stores it into the
    attachment object given as a parameter. The form parameter is the
    mechanize Form to be submitted for downloading the attachment.
    The attachment parameter has to be an object of type
    model.attachment.Attachment.
    """
    time.sleep(self.config.WAIT_TIME)
    logging.info("Getting attachment '%s'", attachment.identifier)
    if self.options.verbose:
        print "Getting attachment '%s'" % attachment.identifier
    mechanize_request = form.click()
    try:
        mform_response = mechanize.urlopen(mechanize_request)
        mform_url = mform_response.geturl()
        if self.list_in_string(self.urls['ATTACHMENT_DOWNLOAD_TARGET'], mform_url):
            attachment.content = mform_response.read()
            attachment.mimetype = magic.from_buffer(attachment.content, mime=True)
            attachment.filename = self.make_attachment_filename(
                attachment.identifier, attachment.mimetype)
        else:
            logging.warn("Unexpected form target URL '%s'", mform_url)
            if self.options.verbose:
                sys.stderr.write("Unexpected form target URL '%s'\n" % mform_url)
    except mechanize.HTTPError as e:
        logging.warn("HTTP Error: code %s, info: %s", e.code, e.msg)
        if self.options.verbose:
            print "HTTP ERROR:", e.code, e.msg
    return attachment
def send_to_WEBFORM(img_file):
    printlog("Send WEB FORM test")
    myWEB = 'http://185.139.68.199/timebox/default/display_form.html'
    FILENAME = ''
    FILENAME = img_file
    printlog('Send test' + img_file)

    br = mechanize.Browser()
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]

    tri = 0
    connect = False
    while not connect:
        try:
            rr = br.open(myWEB)
            response = urlopen(
                "http://185.139.68.199/timebox/default/display_form.html")
            br.select_form(nr=0)
            br['cam_id'] = ['1']
            connect = True
        except mechanize.URLError as e:
            print e.reason.args
            tri += 1
            if tri > 4:  # give up after five failed attempts
                exit()
            sleep(20)

    br.form.add_file(open(FILENAME), 'text/plain', FILENAME)
    br.form.set_all_readonly(False)
    br.submit()
def get_html(url, file_html=None):
    '''
    Downloads the HTML for a given URL.
    @url: URL whose HTML should be extracted.
    @file_html: File where the HTML from the given URL should be saved.
                If None, nothing is written to disk.
    @return: The HTML of the URL passed as a parameter.
    '''
    try:
        # Expand the URL before processing it; if it is an ad/redirect, the
        # article URL is returned instead
        link_expandido = mechanize.urlopen(url, timeout=45.0).geturl()
        # Open the URL
        browser.open(link_expandido, timeout=45.0)
        # Normalize line breaks and divs
        html = browser.response().read().replace('\r', '\n').replace('DIV', 'div')
    except:
        # If something goes wrong with mechanize, fall back to urllib2
        try:
            html = urllib2.urlopen(url).read()
        except:
            browser.close()
            return 'TIMEOUT'  # Page could not be loaded - this branch is not expected to run

    # Save the HTML to the file given as a parameter
    if file_html != None:
        arq = open(file_html, "w")
        print >> arq, html
        arq.close()

    browser.close()
    return html
def get_ics_calendar(url=URL):
    """ Get icalendar from website. """
    response = mechanize.urlopen(url)
    response = response.read()
    return response
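# Usage sketch (not from the original project): get_ics_calendar() returns the
# raw .ics text, which can then be written to a file. The URL is hypothetical.
ics_text = get_ics_calendar("http://example.com/calendar.ics")
with open("calendar.ics", "w") as handle:
    handle.write(ics_text)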
def readUrl(inUrl):
    tryCount = 0
    while tryCount < 5:
        # print "Create CookieJar"
        cookies = mechanize.CookieJar()
        # print "Build Opener"
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        # print "Add Headers"
        opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible; MyProgram/0.1)"),
                             ("From", "*****@*****.**")]
        # print "Install Opener"
        mechanize.install_opener(opener)
        try:
            # print "Open URL"
            response = mechanize.urlopen(inUrl)
            tryCount = 99
        except:
            tryCount += 1
            print "******** Error on urlopen ***********"
            print "URL: ", inUrl
            print "Trying Again....", tryCount

    # print response.read()
    # html = urllib.urlopen(inUrl).read()
    # print "Reading Response"
    html = response.read()
    # print "Response Read:", html[0:100]
    root = lxml.html.fromstring(html)
    # print "Root created: ", root
    return root
def getUrl(self, url):
    req = mechanize.Request(url)
    req.add_header('User-Agent', canna_agent)
    response = mechanize.urlopen(req)
    link = response.read()
    response.close()
    return link
def parse(str_to_parse=""):
    results = []
    tree = html.fromstring(str_to_parse)
    tbl = tree.xpath(table_xpath)[0]
    for row in tbl.xpath(row_xpath):
        result_dict = {}
        if within_date_range(row) and within_price_range(row):
            ln = row.xpath('./td[2]/a')[0]
            result_dict['url'] = ln.get('href')
            result_dict['prop_name'] = ln.text.strip()

            pg_response = urlopen(result_dict['url'])
            new_tree = html.fromstring(pg_response.read())
            adparams = new_tree.xpath('//table[@class="AdParams"]//tr/td')
            #print 'adparams is %s' % adparams
            for param in adparams:
                txt = ''.join(param.xpath('.//text()')).strip().replace(
                    '\n', '').replace('\t', '').split(':')
                result_dict[txt[0]] = txt[1]
            #print 'result_dict is %s' % result_dict
            results.append(result_dict)
    return results
def getUrl(self, url):
    req = mechanize.Request(url)
    req.add_header('User-Agent', ' Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3')
    response = mechanize.urlopen(req)
    link = response.read()
    response.close()
    return link
def downloadtable(url):
    br = mechanize.urlopen(url)
    soup = BeautifulSoup.BeautifulSoup(br.read())
    table = soup.find(id='myTable')
    for row in table('tr'):
        yield [i.string for i in row('td')]
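# Usage sketch (not from the original project): downloadtable() is a generator,
# so iterating over it yields one list of cell strings per table row. The URL
# is hypothetical.
for cells in downloadtable("http://example.com/page-with-table.html"):
    print cells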
def get(request, offset):
    import mechanize
    import json

    allow = offset
    dont_allow = ""
    if "+not+" in offset:
        [allow, dont_allow] = offset.split("+not+")

    s = ""
    for word in allow.split("+"):
        s += "&allowedIngredient[]=%s" % (word)
    for word in dont_allow.split("+"):
        s += "&excludeIngredient[]=%s" % (word)

    url = "http://api.yummly.com/v1/api/recipes?_app_id=%s&_app_key=%s%s&requirePictures=true" % (APP_ID, APP_KEY, s)
    res = mechanize.urlopen(url)
    page = "".join(str(line) for line in res)
    result = json.loads(page)

    results = []
    for re in result["matches"]:
        re["smallImageUrls"] = [re["smallImageUrls"][0].replace(".s.", ".l.")]
        name = re["recipeName"]
        if len(name) > 40:
            name = name[:37] + "..."
        re["recipeName"] = name
        results.append(re)

    return render_to_response(
        "search.html",
        {"search_results": results, "term": offset.replace("+", ", ").title()},
        context_instance=RequestContext(request),
    )
def test_geturl(self):
    # Make sure same URL as opened is returned by geturl.
    handler = self._make_request_handler([(200, [], "we don't care")])
    open_url = mechanize.urlopen("http://localhost:%s" % handler.port)
    url = open_url.geturl()
    self.assertEqual(url, "http://localhost:%s" % handler.port)
def _scrapeUrl(self, url):
    """scrape a generic url """
    # grab the data -- go internets!
    request3 = mechanize.Request(url)
    self.cj.add_cookie_header(request3)
    response3 = mechanize.urlopen(request3)
    maincontent = response3.read()

    # make the soup
    soup = BeautifulSoup(maincontent)

    # parse the soup
    # This thing is a beast
    # date/times and games are interspersed
    # The first thing should be a date
    # then all games following are on that date
    # So - we find all dates and games with our query and handle them
    # as they happen in order
    date = None
    tags = soup.findAll(**{'class': ["schedules-list-date",
                                     'schedules-list-hd pre',
                                     'schedules-list-hd post']})
    print "found %s tags" % len(tags)
    for tag in tags:
        # we got a date!
        if tag['class'] == 'schedules-list-date':
            # we've found a new date
            gameDateStr = str(tag.find('span').text)
            monthStr, date = gameDateStr.split(',')[1].strip().split()
            monthNum = self.MONTH_MAP[str(monthStr)]
            if monthNum in (1, 2):
                year = self.year + 1
            else:
                year = self.year
            dateInt = int(''.join([x for x in date if x.isdigit()]))
            date = datetime.date(year, monthNum, dateInt)
        else:
            # we've got a new game - parse out home and away team
            home = str(tag.find(**{'class': ['team-name home ', 'team-name home lost']}).text)
            away = str(tag.find(**{'class': ['team-name away ', 'team-name away lost']}).text)

            # need to get the time as well
            time = str(tag.find(**{'class': 'time'}).text)
            if time == 'FINAL':
                print "CANNOT GET VALID TIME FOR GAMES that are in the past"
                hr = 0
                minute = 0
            else:
                hr, minute = time.split(':')
                amPm = str(tag.find(**{'class': ['am', 'pm']}).text).strip()
                hr = int(hr)
                minute = int(minute)
                # adjust times to take into account am/pm
                if amPm == "PM" and hr < 12:
                    hr += 12
                if amPm == "AM" and hr == 12:
                    hr = 0

            d = {'week': self.week,
                 'home': self.TEAM_MAP[home],
                 'away': self.TEAM_MAP[away],
                 'kickoff': datetime.datetime(date.year, date.month, date.day, hr, minute,
                                              tzinfo=self.EASTERN_TIME_ZONE)}
            self.games.append(d)