def test_urlretrieve(self):
    url = urljoin(self.uri, "/mechanize/")
    test_filename = "python.html"

    def check_retrieve(opener, filename, headers):
        self.assertEqual(headers.get('Content-Type'), 'text/html')
        f = open(filename)
        data = f.read()
        f.close()
        opener.close()
        from urllib import urlopen
        r = urlopen(url)
        self.assertEqual(data, r.read())
        r.close()

    opener = mechanize.build_opener()
    verif = CallbackVerifier(self)
    filename, headers = opener.retrieve(url, test_filename, verif.callback)
    try:
        self.assertEqual(filename, test_filename)
        check_retrieve(opener, filename, headers)
        self.assert_(os.path.isfile(filename))
    finally:
        os.remove(filename)

    opener = mechanize.build_opener()
    verif = CallbackVerifier(self)
    filename, headers = opener.retrieve(url, reporthook=verif.callback)
    check_retrieve(opener, filename, headers)
    # closing the opener removed the temporary file
    self.failIf(os.path.isfile(filename))
def slurp_with_login_and_pwd():
    import sys
    import mechanize
    # sys.path.append('ClientCookie-1.0.3')
    # from mechanize import ClientCookie
    # sys.path.append('ClientForm-0.1.17')
    # import ClientForm

    # Create special URL opener (for User-Agent) and cookieJar
    cookieJar = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookieJar))
    opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible)")]
    mechanize.install_opener(opener)
    fp = mechanize.urlopen("http://login.yahoo.com")
    forms = mechanize.ParseResponse(fp)
    fp.close()

    # print forms on this page
    for form in forms:
        print "***************************"
        print form

    form = forms[0]
    form["login"] = "******"     # use your userid
    form["passwd"] = "password"  # use your password
    fp = mechanize.urlopen(form.click())
    fp.close()
    fp = mechanize.urlopen(
        "https://class.coursera.org/ml-003/lecture/download.mp4?lecture_id=1"
    )  # use your group
    fp.readlines()
    fp.close()
def retrieve_product_data(self, product_link):
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
                         ('From', '*****@*****.**')]
    mechanize.install_opener(opener)

    browser = mechanize.Browser()
    product_data = browser.open(product_link).get_data()
    soup = BeautifulSoup(product_data)

    product_name = soup.find('title').string.encode('ascii', 'ignore')
    product_prices = soup.find('div', 'price').contents

    try:
        cash_price = int(clean_price_string(product_prices[4]))

        product_data = ProductData()
        product_data.custom_name = product_name
        product_data.price = cash_price
        product_data.url = product_link
        product_data.comparison_field = product_link

        return product_data
    except IndexError:
        return None
def constructUploadName(loginname, requestedfilename):
    # construct name
    import random
    filename = os.path.split(requestedfilename)[1]
    filename = filename[:string.find(filename, ".")] + ".jpg"  # construct jpg extension
    resultName = string.lower(loginname + "_" + APP_VERSION_STR + "_" + filename)  # prepend loginname
    resultName = string.replace(resultName, " ", "_")  # replace spaces
    resultName = string.replace(resultName, "'", "_")  # replace '
    # resultName = urllib.quote(resultName)  # make safe url
    theResult = ""
    for theChar in resultName:
        if theChar in ALLOWED_CHARS:
            theResult += theChar
    resultName = theResult

    # check whether ok
    # build opener. Can be extended to handle cookies/proxies
    opener = mechanize.build_opener()
    # goto upload page
    request3 = mechanize.Request(FORUM_UPLOAD_URL)
    response3 = opener.open(request3)
    # page = string.lower(response3.read())
    response3.close()
    random.seed()
    # while name is not ok, permute
    for _i in range(6):
        resultName = str(random.random())[-1] + resultName  # prepend with random number
    # while string.find(page, resultName) <> -1:
    return resultName
def uploadFileToAquaforum(uploadFilename, requestedFileName):
    '''
    returns response page
    '''
    # build opener. Can be extended to handle cookies/proxies
    opener = mechanize.build_opener()

    # goto upload page
    request3 = mechanize.Request(FORUM_UPLOAD_URL)
    response3 = opener.open(request3)

    # parse form on upload page and add file
    forms = mechanize.ParseResponse(response3, backwards_compat=False)
    form = forms[0]
    filectr = form.find_control("imgfile")
    # filectr.add_file(open('/home/jasper/avatar.jpg'), "image/jpeg", "avatar.jpg")
    theFile = file(uploadFilename, 'rb')
    filectr.add_file(theFile, "image/jpeg",
                     os.path.split(requestedFileName)[-1])

    # obtain form data
    request4 = form.click()  # urllib2.Request object
    theFile.close()
    request4.add_header('Referer', response3.geturl())
    response4 = opener.open(request4)
    return response4.read()
def __init__(self, feed_id, logging, keyfile="xively.key", timeout=5,
             uptime=False):
    threading.Thread.__init__(self)
    # private key stored in a file
    try:
        api_key = open(keyfile).readlines()[0].strip()
    except IOError as e:
        raise Xively_Exception("missing api key file: xively.key")
    self.feed_id = feed_id
    self.timeout = timeout
    self.logger = logging.getLogger('xively')
    self.opener = mechanize.build_opener()
    self.opener.addheaders = [('X-ApiKey', api_key)]
    self.data = []
    self.payload = {}
    if uptime:
        self.add_uptime()
def __init__(self, username="******", password="******"):
    self.username = "******" + username
    self.password = password
    self.password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
    ntlm_auth = HTTPNtlmAuthHandler.HTTPNtlmAuthHandler(self.password_manager)
    opener = mechanize.build_opener(ntlm_auth)
    mechanize.install_opener(opener)
def __init__(self, feed_id, apikey):
    self._version = "1.0.0"
    self._feed_id = feed_id
    self._opener = mechanize.build_opener()
    self._opener.addheaders = [('X-ApiKey', apikey)]
    self._data = []
    self._payload = {}
def readUrl(inUrl):
    tryCount = 0
    while tryCount < 5:
        # print "Create CookieJar"
        cookies = mechanize.CookieJar()
        # print "Build Opener"
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        # print "Add Headers"
        opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible; MyProgram/0.1)"),
                             ("From", "*****@*****.**")]
        # print "Install Opener"
        mechanize.install_opener(opener)
        try:
            # print "Open URL"
            response = mechanize.urlopen(inUrl)
            tryCount = 99
        except:
            tryCount += 1
            print "******** Error on urlopen ***********"
            print "URL: ", inUrl
            print "Trying Again....", tryCount

    # print response.read()
    # html = urllib.urlopen(inUrl).read()
    # print "Reading Response"
    html = response.read()
    # print "Response Read:", html[0:100]
    root = lxml.html.fromstring(html)
    # print "Root created: ", root
    return root
def __callRequest(self):
    cookieJar = mechanize.LWPCookieJar()
    try:  # TODO: maybe drop the try
        cookieJar.load(self._cookiePath, self.__bIgnoreDiscard, self.__bIgnoreExpired)
    except Exception as e:
        logger.info(e)

    sParameters = urllib.urlencode(self.__aParameters)

    opener = mechanize.build_opener(SmartRedirectHandler,
                                    mechanize.HTTPEquivProcessor,
                                    mechanize.HTTPRefreshProcessor)
    if (len(sParameters) > 0):
        oRequest = mechanize.Request(self.__sUrl, sParameters)
    else:
        oRequest = mechanize.Request(self.__sUrl)

    for aHeader in self.__aHeaderEntries:
        for sHeaderKey, sHeaderValue in aHeader.items():
            oRequest.add_header(sHeaderKey, sHeaderValue)
    cookieJar.add_cookie_header(oRequest)

    if self.caching and self.cacheTime > 0:
        sContent = self.readCache(self.getRequestUri())
        if sContent:
            return sContent
    try:
        oResponse = opener.open(oRequest, timeout=60)
    except mechanize.HTTPError, e:
        if not self.ignoreErrors:
            xbmcgui.Dialog().ok('xStream', 'Fehler beim Abrufen der Url:',
                                self.__sUrl, str(e))
            logger.error("HTTPError " + str(e) + " Url: " + self.__sUrl)
            return ''
        else:
            oResponse = e
def _parsingRobotsFile(self, url):
    """ Set up internal state after downloading robots.txt

        If urlopen.code in (401, 403), all user-agents are disallowed. -> removed
        If urlopen.code >= 400, all user-agents are allowed.
    """
    domain_name = urlparse.urlparse(url)[1]
    rules = RobotsTextRules()
    # getLogger().debug("++Trying to download: %s", url)

    opener = mechanize.build_opener(mechanize.HTTPRefreshProcessor,)
    rq = mechanize.Request(url)
    rq.add_header("User-agent", "Mozilla/5.0 (compatible; Windows NT 6.1?; ZumBot/1.0; http://help.zum.com/inquiry)")
    # shttp.setRequestHeader(user_agent = USER_AGENT)
    rs = None
    try:
        rs = opener.open(rq)
        header = rs.info()
        rules.return_code = rs.code
    except Exception, msg:
        try:
            if not url.startswith("http://www."):
                t_url = url.replace("http://", "http://www.")
                rq = mechanize.Request(t_url)
                rq.add_header("User-agent", "Mozilla/5.0 (compatible; Windows NT 6.1?; ZumBot/1.0; http://help.zum.com/inquiry)")
                rs = opener.open(rq)
                header = rs.info()
                rules.return_code = rs.code
        except Exception, msg:
            return rules
def init(self):
    br = mechanize.Browser()
    br.set_handle_robots(False)
    self.cj = mechanize.LWPCookieJar()
    br.set_cookiejar(self.cj)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.open("https://www.tumblr.com/login")
    br.select_form(nr=0)
    br['user[email]'] = ""
    br['user[password]'] = ""
    url, data, hdrs = br.form.click_request_data()
    br.open("https://www.tumblr.com/login", data)
    self.nf = 0
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(self.cj))
    mechanize.install_opener(opener)
    self._fetch()
def _checkStoredInjections(self):
    for r in self.results:
        # At this state injections in Result obj are not
        # compacted yet so it will only be 1st injected param
        url, data = r.target.getPayloadedUrl(r.first_param, "")

        # In case of proxy
        if self.engine.getOption('http-proxy') is not None:
            proxy = ProxyHandler({'http': self.engine.getOption('http-proxy')})
            opener = build_opener(proxy)
            install_opener(opener)

        # Some headers
        if self.engine.getOption('ua') is not None:
            if self.engine.getOption('ua') == "RANDOM":
                headers = {'User-Agent': random.choice(USER_AGENTS)}
            else:
                headers = {'User-Agent': self.engine.getOption('ua')}
        else:
            headers = {}
        if self.engine.getOption("cookie") is not None:
            headers["Cookie"] = self.engine.getOption("cookie")

        # Build the request
        req = Request(url, data, headers)
        try:
            to = 10 if self.engine.getOption('http-proxy') is None else 20
            response = urlopen(req, timeout=to)
        except HTTPError, e:
            self._addError(e.code, r.target.getAbsoluteUrl())
            continue
        except URLError, e:
            self._addError(e.reason, r.target.getAbsoluteUrl())
            continue
def customizeUserAgent():
    import mechanize
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    # Pretend to be Chrome to avoid getting the mobile site.
    opener.addheaders = [("User-agent", "Chrome/16.0.912.63")]
    mechanize.install_opener(opener)
def retrieve_product_links(self):
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
                         ('From', '*****@*****.**')]
    mechanize.install_opener(opener)

    url_base = 'http://www.globalmac.cl/'
    browser = mechanize.Browser()

    url_extensions = [
        ['Distribuidor-Apple-Chile/MacBook-Air', 'Notebook'],
        ['Distribuidor-Apple-Chile/MacBook-Pro', 'Notebook'],
        ['Hardware-Mac-PC/Discos-Duros-Notebook-SATA-2.5', 'StorageDrive'],
        ['Hardware-Mac-PC/Discos-Duros-SATA-3.5', 'StorageDrive'],
        ['Hardware-Mac-PC/Discos-Duros-SSD-SATA-2.5', 'StorageDrive'],
    ]

    product_links = []
    for url_extension, ptype in url_extensions:
        url = url_base + url_extension
        base_data = browser.open(url).get_data()
        soup = BeautifulSoup(base_data)
        for item in soup.findAll('div', 'name'):
            product_links.append([item.find('a')['href'], ptype])

    return product_links
def _performInjections(self, target):
    # Check every parameter
    for k, v in target.params.iteritems():
        pl = Payload(taint=True)
        url, data = target.getPayloadedUrl(k, pl.payload)

        # In case of proxy
        if self.engine.getOption('http-proxy') is not None:
            proxy = ProxyHandler({'http': self.engine.getOption('http-proxy')})
            opener = build_opener(proxy)
            install_opener(opener)

        # Some headers
        if self.engine.getOption('ua') is not None:
            if self.engine.getOption('ua') == "RANDOM":
                headers = {'User-Agent': random.choice(USER_AGENTS)}
            else:
                headers = {'User-Agent': self.engine.getOption('ua')}
        else:
            headers = {}
        if self.engine.getOption("cookie") is not None:
            headers["Cookie"] = self.engine.getOption("cookie")

        # Build the request
        req = Request(url, data, headers)
        try:
            to = 10 if self.engine.getOption('http-proxy') is None else 20
            response = urlopen(req, timeout=to)
        except HTTPError, e:
            self._addError(e.code, target.getAbsoluteUrl())
            return
        except URLError, e:
            self._addError(e.reason, target.getAbsoluteUrl())
            return
def _retrieve_product(cls, url):
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
                         ('From', '*****@*****.**')]
    mechanize.install_opener(opener)

    browser = mechanize.Browser()
    product_data = browser.open(url).get_data()
    soup = BeautifulSoup(product_data)

    product_name = soup.find('h1').string.encode('ascii', 'ignore')

    product_price = soup.find('span', {'id': 'product_price'})
    product_price = Decimal(clean_price_string(product_price.string))

    payment_methods = ['cash', 'deposit', 'wire_transfer']
    additional_data = soup.find('td', 'descr').findAll('h3')
    if not additional_data:
        payment_methods.extend(['debit_card', 'credit_card'])
    elif additional_data[0].string and 'Contado' not in \
            additional_data[0].string:
        payment_methods.extend(['debit_card', 'credit_card'])

    prices = {}
    for p in payment_methods:
        prices[p] = product_price

    return [product_name, prices]
def themain():
    # browser = mechanize.Browser()
    # browser.open('http://www.baidu.com')
    cj = mechanize.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)
    r = mechanize.urlopen('http://www.baidu.com')
    cj.save('cookie.txt', ignore_discard=True, ignore_expires=True)
def GetHtml(url):
    opener = mechanize.build_opener()
    opener.addheaders = [("User-Agent",
                          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0) Gecko/20100101 Firefox/4.0")]
    mechanize.install_opener(opener)
    request = mechanize.urlopen(url)
    html = request.read()
    request.close()
    return html
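# A minimal usage sketch for GetHtml above; the URL below is a placeholder,
# not taken from the original snippet.
if __name__ == "__main__":
    page = GetHtml("http://example.com/")
    print page[:200]  # show the first few hundred bytes of the fetched HTML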
def post(self, URL, QueryString, headers=None):
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(self.cookieJar))
    if headers:
        opener.addheaders = headers
    qs = urllib.urlencode(QueryString)
    response = opener.open(URL, qs)
    return response
def addAuthentication(self, url, username, password):
    passman = mechanize.HTTPPasswordMgrWithDefaultRealm()
    passman.add_password(None, url, username, password)
    # create the NTLM authentication handler
    auth_NTLM = HTTPNtlmAuthHandler.HTTPNtlmAuthHandler(passman)
    # create the opener and hand it back to the caller
    opener = mechanize.build_opener(auth_NTLM)
    return opener
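# Hedged usage sketch for addAuthentication above: `client` stands for whatever
# object defines the method, and the URL/credentials are placeholders. The
# returned opener performs NTLM authentication for matching requests.
opener = client.addAuthentication("http://intranet.example.com/", "user", "secret")
response = opener.open("http://intranet.example.com/reports")
print response.read()[:200]
response.close()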
def resolve(self, url, cookie_jar, user_agent):
    headers = {'User-agent': user_agent, 'Referer': url}

    try:
        cookie_jar.load(ignore_discard=True)
    except Exception as e:
        logger.info(e)

    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookie_jar))

    request = mechanize.Request(url)
    for key in headers:
        request.add_header(key, headers[key])
    try:
        response = opener.open(request)
    except mechanize.HTTPError as e:
        response = e
    body = response.read()

    cookie_jar.extract_cookies(response, request)
    cookie_helper.check_cookies(cookie_jar)

    parsed_url = urlparse(url)
    submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, parsed_url.netloc)

    params = {}
    try:
        params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"', body).group(1)
        params["pass"] = re.search(r'name="pass" value="(.+?)"', body).group(1)
        js = self._extract_js(body)
    except mechanize.HTTPError as e:
        return None

    params["jschl_answer"] = str(js + len(parsed_url.netloc))

    sParameters = urllib.urlencode(params, True)
    request = mechanize.Request("%s?%s" % (submit_url, sParameters))
    for key in headers:
        request.add_header(key, headers[key])

    sleep(5)

    try:
        response = opener.open(request)
    except mechanize.HTTPError as e:
        response = e
    return response
def __init__(self, **kwargs):
    # self._opener = kwargs.get('opener', None)
    self._username = unicode(kwargs.get('user', RedditSession._username))
    self._passwd = unicode(kwargs.get('passwd', RedditSession._password))
    self._cookies = mechanize.CookieJar()
    self._opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(self._cookies))
    self._do_login()
def test_robots(self):
    plain_opener = mechanize.build_opener(mechanize.HTTPRobotRulesProcessor)
    browser = mechanize.Browser()
    for opener in plain_opener, browser:
        r = opener.open(urljoin(self.uri, "robots"))
        self.assertEqual(r.code, 200)
        self.assertRaises(mechanize.RobotExclusionError,
                          opener.open, urljoin(self.uri, "norobots"))
def setup_mechanize():
    """
    Set up user agent for all mechanize calls.
    """
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    homepage = "http://github.com/aszlig/picfetcher"
    opener.addheaders = [("User-agent", "PicFetcher/0.1.0 (+%s)" % homepage)]
    mechanize.install_opener(opener)
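# Hedged usage sketch for setup_mechanize above: once the opener is installed,
# plain mechanize.urlopen() calls carry the custom User-agent header and share
# the cookie jar. The URL below is a placeholder, not part of the original.
setup_mechanize()
response = mechanize.urlopen("http://example.com/")
print response.read()[:200]
response.close()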
def test_retrieve_to_named_file(self):
    url = urljoin(self.uri, "/mechanize/")
    test_filename = os.path.join(self.make_temp_dir(), "python.html")
    opener = mechanize.build_opener()
    verif = CallbackVerifier(self)
    filename, headers = opener.retrieve(url, test_filename, verif.callback)
    self.assertEqual(filename, test_filename)
    self._check_retrieve(url, filename, headers)
    self.assert_(os.path.isfile(filename))
def __callRequest(self):
    if self.caching and self.cacheTime > 0:
        sContent = self.readCache(self.getRequestUri())
        if sContent:
            return sContent

    cookieJar = mechanize.LWPCookieJar(filename=self._cookiePath)
    try:  # TODO: maybe drop the try
        cookieJar.load(ignore_discard=self.__bIgnoreDiscard,
                       ignore_expires=self.__bIgnoreExpired)
    except Exception as e:
        logger.info(e)

    sParameters = urllib.urlencode(self.__aParameters, True)

    handlers = [
        SmartRedirectHandler,
        mechanize.HTTPEquivProcessor,
        mechanize.HTTPRefreshProcessor
    ]
    if sys.version_info >= (2, 7, 9) and sys.version_info < (2, 7, 11):
        handlers.append(newHTTPSHandler)
    opener = mechanize.build_opener(*handlers)

    if (len(sParameters) > 0):
        oRequest = mechanize.Request(self.__sUrl, sParameters)
    else:
        oRequest = mechanize.Request(self.__sUrl)

    for key, value in self.__headerEntries.items():
        oRequest.add_header(key, value)
    cookieJar.add_cookie_header(oRequest)

    user_agent = self.__headerEntries.get(
        'User-Agent',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3'
    )

    try:
        oResponse = opener.open(oRequest, timeout=self.requestTimeout)
    except mechanize.HTTPError, e:
        if e.code == 503 and e.headers.get("Server") == 'cloudflare-nginx':
            html = e.read()
            oResponse = self.__check_protection(html, user_agent, cookieJar)
            if not oResponse:
                logger.error("Failed to get CF-Cookie for Url: " + self.__sUrl)
                return ''
        elif not self.ignoreErrors:
            xbmcgui.Dialog().ok('xStream', 'Fehler beim Abrufen der Url:',
                                self.__sUrl, str(e))
            logger.error("HTTPError " + str(e) + " Url: " + self.__sUrl)
            return ''
        else:
            oResponse = e
def setUp(self):
    TestCase.setUp(self)
    fixture_name = "test_urllib2_localnet_ProxyAuthTests_server"
    self.register_context_manager(fixture_name,
                                  testprogram.ServerCM(self._make_server))
    server = self.get_cached_fixture(fixture_name)
    proxy_url = "http://127.0.0.1:%d" % server.port
    handler = mechanize.ProxyHandler({"http": proxy_url})
    self.proxy_digest_handler = mechanize.ProxyDigestAuthHandler()
    self.opener = mechanize.build_opener(handler, self.proxy_digest_handler)
def test_retrieve(self):
    # not passing an explicit filename downloads to a temporary file
    # using a Request object instead of a URL works
    url = urljoin(self.uri, "/mechanize/")
    opener = mechanize.build_opener()
    verif = CallbackVerifier(self)
    request = mechanize.Request(url)
    filename, headers = opener.retrieve(request, reporthook=verif.callback)
    self.assertEquals(request.visit, False)
    self._check_retrieve(url, filename, headers)
    opener.close()
    # closing the opener removed the temporary file
    self.failIf(os.path.isfile(filename))
def openUrl(url, cookie=None, login=False):
    """
    Opens a given url through mechanize.
    If there is no cookie (string path) passed in, or if there is a cookie path
    passed in but the login parameter is False (signifying to open the url with
    the cookie saved in the cookie path), the html from the opened url is
    returned as a string.
    If a cookie path is passed in and the login parameter is True, then the
    mechanize.Browser object is returned to perform a yogaglo login through a
    form submission.
    """
    browser = mechanize.Browser()
    browser.addheaders = [
        ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:24.0) Gecko/20100101 Firefox/24.0'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Accept-Language', 'en-gb,en;q=0.5'),
        ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
        ('Keep-Alive', '115'),
        ('Connection', 'keep-alive'),
        ('Cache-Control', 'max-age=0'),
    ]

    # Experimental?
    # browser.set_handle_gzip(True)
    browser.set_handle_redirect(True)
    browser.set_handle_referer(True)
    browser.set_handle_robots(False)
    browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    if not cookie is None:
        cj = cookielib.LWPCookieJar()
        browser.set_cookiejar(cj)
        opener = mechanize.build_opener(HTTPCookieProcessor(cj))
        mechanize.install_opener(opener)

        # trying to login, no cookie, must return browser so it can follow the
        # login url
        if login is True:
            browser.open(url)
            return browser

        # can't set to expire, can't read when this particular cookie expires
        cj.load(cookie, ignore_discard=True)

    return browser.open(url).read()
def __init__(self):
    self.cj = mechanize.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(self.cj))
    mechanize.install_opener(opener)
    self.br = mechanize.Browser()
    self.br.set_cookiejar(self.cj)
    self.sessionkey = 'None'
    self.br.set_header(
        'User-Agent',
        value='Mozilla/5.0 (X11; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0'
    )
    # self.br.set_debug_http(True)
    self.br.set_debug_redirects(True)
def initialize_browser():
    """Settings that work around cookies, robots.txt and other checks so the
    scraper looks like a normal browser."""
    cookiejar = cookielib.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookiejar))
    mechanize.install_opener(opener)
    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.set_handle_redirect(True)
    browser.set_cookiejar(cookiejar)
    browser.set_handle_equiv(True)
    browser.set_handle_referer(True)
    browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=2)
    browser.addheaders = [('User-agent', 'Google Chrome')]
    return browser, cookiejar
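# Hedged usage sketch for initialize_browser above; the URL is a placeholder.
browser, cookiejar = initialize_browser()
response = browser.open("http://example.com/")
print response.read()[:200]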
def setUp(self):
    mechanize._testcase.TestCase.setUp(self)
    self.test_uri = urljoin(self.uri, "test_fixtures")
    self.server = self.get_cached_fixture("server")
    if self.no_proxies:
        old_opener_m = mechanize._opener._opener
        old_opener_u = urllib2._opener
        mechanize.install_opener(mechanize.build_opener(
            mechanize.ProxyHandler(proxies={})))
        urllib2.install_opener(urllib2.build_opener(
            urllib2.ProxyHandler(proxies={})))

        def revert_install():
            mechanize.install_opener(old_opener_m)
            urllib2.install_opener(old_opener_u)
        self.add_teardown(revert_install)
def __init__(self, feed_id, logging, timeout=5, uptime=False):
    threading.Thread.__init__(self)
    self.feed_id = feed_id
    self.timeout = timeout
    self.logger = logging.getLogger(__name__)
    self.opener = mechanize.build_opener()
    self.opener.addheaders = [('X-ApiKey', xively_key)]
    self.data = []
    self.payload = {}
    if uptime:
        self.add_uptime()
def setUp(self):
    mechanize._testcase.TestCase.setUp(self)
    self.test_uri = urljoin(self.uri, "test_fixtures")
    self.server = self.get_cached_fixture("server")
    if self.no_proxies:
        old_opener_m = mechanize._opener._opener
        mechanize.install_opener(
            mechanize.build_opener(mechanize.ProxyHandler(proxies={})))
        install_opener(build_opener(ProxyHandler(proxies={})))

        def revert_install():
            mechanize.install_opener(old_opener_m)
            install_opener(None)
        self.add_teardown(revert_install)
def get_trash_zone(address, zip):
    # Make cookie jar. See wwwsearch.sourceforge.net/mechanize/hints.html
    cj = mechanize.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)

    # Save cookies
    cj.save("/usr/local/django/recyclocity/recyclocity_static/cookies/cookie_jar",
            ignore_discard=True, ignore_expires=True)

    # Create a browser
    browser = mechanize.Browser()

    # Fill in form
    browser.open('http://lmt-web.lowermerion.org/cgi-bin/refuse2.plx')
    browser.form = list(browser.forms())[0]
    browser.form['askrecycle'] = address
    browser.form['postcode'] = zip

    # Submit form
    browser.submit()

    # Extract content
    content = browser.response().read()

    # Use pattern match to extract fields
    m = re.search('<b>(Monday|Tuesday|Wednesday|Thursday|Friday)</b>', content)
    if m:
        day, = m.groups()
        # Convert day to number
        day_number = schedule_helpers.get_day_number(day)
    else:
        # Failed
        return

    m = re.search('<b>Zone ([1-4])</b>', content)
    if m:
        zone, = m.groups()
    else:
        # Failed
        return

    # Match for both day and zone
    return day_number, zone
def __init__(self, username, password):
    mechanize.Browser.__init__(self)
    cj = mechanize.LWPCookieJar()
    self.set_cookiejar(cj)
    self.set_handle_equiv(True)
    self.set_handle_redirect(True)
    self.set_handle_referer(True)
    self.set_handle_robots(False)
    self.addheaders = [('User-agent',
                        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    self.open(self.base_url)
    self.username = username
    self.password = password
    self.login()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)
def __init__(self, feed_id, logging, keyfile="xively.key", timeout=5,
             uptime=False):
    threading.Thread.__init__(self)
    # private key stored in a file
    api_key = open(keyfile).readlines()[0].strip()
    self.feed_id = feed_id
    self.timeout = timeout
    self.logger = logging.getLogger('xively')
    self.opener = mechanize.build_opener()
    self.opener.addheaders = [('X-ApiKey', api_key)]
    self.data = []
    self.payload = {}
    if uptime:
        self.add_uptime()
def get(self, URL, QueryString=None, headers=None):
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(self.cookieJar))
    if headers:
        opener.addheaders = headers
    content = ""
    if (QueryString):
        qs = urllib.urlencode(QueryString)
        response = opener.open(URL + '?' + qs)
        content = response
    else:
        response = opener.open(URL)
        content = response
    return content
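# Hedged usage sketch for the get() helper above: `client` stands for whatever
# class defines it (assumed to hold a cookieJar attribute); the URL and query
# parameters are placeholders.
response = client.get("http://example.com/search", QueryString={"q": "mechanize"})
print response.read()[:200]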
def get_trash_zone(address, zip):
    # Make cookie jar. See wwwsearch.sourceforge.net/mechanize/hints.html
    cj = mechanize.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)

    # Create a browser
    browser = mechanize.Browser()

    # User-Agent (this is cheating, ok?)
    browser.addheaders = [('User-agent',
                           'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    # Save cookies
    cj.save("/usr/local/django/recyclocity/recyclocity_static/cookies/cookie_jar",
            ignore_discard=True, ignore_expires=True)

    # Fill in form
    # browser.open('http://citymaps.phila.gov/portal/')
    # browser.select_form(name="form1")
    # browser.form['txtSearchAddress'] = address

    # Fill in form
    # browser.open('https://alpha.phila.gov/property/')
    # browser.open('http://www.lowermerion.org/cgi-bin/recycle2.plx/')
    browser.open(
        'http://www.lowermerion.org/services/public-works-department/refuse-and-recycling/how-to-determine-your-recycling-collection-day'
    )
    # browser.form = list(browser.forms())[0]
    # browser.form['askrecycle'] = address
    # browser.form['postcode'] = zip

    # Submit form
    # browser.submit()

    # Extract content
    content = browser.response().read()
    return content
def __callRequest(self):
    cookieJar = mechanize.LWPCookieJar()
    try:  # TODO: maybe drop the try
        cookieJar.load(self._cookiePath, self.__bIgnoreDiscard, self.__bIgnoreExpired)
    except Exception as e:
        logger.info(e)

    sParameters = urllib.urlencode(self.__aParameters, True)

    handlers = [
        SmartRedirectHandler,
        mechanize.HTTPEquivProcessor,
        mechanize.HTTPRefreshProcessor
    ]
    if sys.version_info >= (2, 7, 9) and sys.version_info < (2, 7, 11):
        handlers.append(newHTTPSHandler)
    opener = mechanize.build_opener(*handlers)

    if (len(sParameters) > 0):
        oRequest = mechanize.Request(self.__sUrl, sParameters)
    else:
        oRequest = mechanize.Request(self.__sUrl)

    for key, value in self.__headerEntries.items():
        oRequest.add_header(key, value)
    cookieJar.add_cookie_header(oRequest)

    if self.caching and self.cacheTime > 0:
        sContent = self.readCache(self.getRequestUri())
        if sContent:
            return sContent
    try:
        oResponse = opener.open(oRequest, timeout=self.requestTimeout)
    except mechanize.HTTPError, e:
        if e.code == 503 and e.headers.get("Server") == 'cloudflare-nginx':
            oResponse, cookieJar = cCFScrape().resolve(oRequest, e, cookieJar)
        elif not self.ignoreErrors:
            xbmcgui.Dialog().ok('xStream', 'Fehler beim Abrufen der Url:',
                                self.__sUrl, str(e))
            logger.error("HTTPError " + str(e) + " Url: " + self.__sUrl)
            return ''
        else:
            oResponse = e
def go():
    '''
    Main procedure of the scraper. Creates a browser, loads the list of tasks
    and executes them.
    '''
    try:
        # Prepare the browser
        cookies = mechanize.CookieJar()
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        mechanize.install_opener(opener)
        br = mechanize.Browser()
        br.set_handle_robots(False)
        br.set_handle_refresh(False)
        br.set_handle_referer(False)
        br.open("http://www.infogreffe.fr/infogreffe/process.do")

        # Get the list of tasks
        tasks = load_task_queue()
        if len(tasks) == 0:
            # If there is no task to execute, init/reset the table
            init_task_queue()
            tasks = load_task_queue()

        for task in tasks:
            try:
                # Execute the task
                results = get_companies(br, task['name'], task['dept'])
                # If we hit the soft limit, add more refined searches to the queue
                if results == 100:
                    print "Limit reached for %s in %s, adding new tasks" % (
                        task['name'], task['dept'])
                    expand_task_queue(task['name'], task['dept'])
                # Mark the task as done
                mark_task_done(task['name'], task['dept'], results)
            except Exception as detail:
                # We may get an exception for using too much CPU time.
                print "Exception raised", detail
    except Exception as detail:
        # If we can't open the browser, just skip running the scraper
        print "Failed starting browser ", detail
def resolve(self, req, error, cookieJar):
    sleep(5)
    useragent = req.headers.get('User-agent')
    body = error.read()

    parsed_url = urlparse(error.url)
    submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, parsed_url.netloc)

    params = {}
    try:
        params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"', body).group(1)
        params["pass"] = re.search(r'name="pass" value="(.+?)"', body).group(1)
        js = self._extract_js(body)
    except:
        raise

    params["jschl_answer"] = str(js + len(parsed_url.netloc))

    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookieJar))

    sParameters = urllib.urlencode(params, True)
    request = mechanize.Request("%s?%s" % (submit_url, sParameters))
    request.add_header('Referer', error.url)
    request.add_header('User-agent', useragent)

    try:
        response = opener.open(request)
    except:
        raise

    return response, cookieJar
def logIn(self):
    """
    Logs in to private archives using the supplied email and password.
    Stores the cookie so we can continue to get subsequent pages.
    """
    cookieJar = mechanize.CookieJar()

    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookieJar))
    opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible)")]
    mechanize.install_opener(opener)

    self.message('Logging in to ' + self.list_url)

    fp = mechanize.urlopen(self.list_url)
    forms = ClientForm.ParseResponse(fp, backwards_compat=False)
    fp.close()

    form = forms[0]
    form['username'] = self.username
    form['password'] = self.password
    fp = mechanize.urlopen(form.click())
    fp.close()