def _journey(self, func_name, *args, **kwargs):
    """
    'func_name' should be one of 'open', 'reload', 'back', or 'follow_link'.

    journey then runs that function with the given arguments and turns
    the results into a nice friendly standard ResultWrapper object, which
    is stored as 'self.result'.

    * 'open' fetches the URL given as args[0].
    * 'follow_link' looks up args[0] with find_link() and fetches the
      resulting URL, resolved against the current page if it carries no
      scheme.
    * 'reload' fetches the current URL again.
    * 'back' pops the most recent page off the history; nothing is
      fetched.

    Raises TwillException for an unknown 'func_name', when there is no
    page to go back to or to reload, and when no link matches.  Errors
    raised by the session while fetching are not handled here.

    (Idea stolen straight from PBP.)
    """
    self.last_submit_button = None

    if func_name == 'back':
        # 'back' only restores a stored page.
        try:
            self.result = self._history.pop()
        except IndexError:
            raise TwillException("no previous page to go back to")
        return

    if func_name == 'open':
        url = args[0]
    elif func_name == 'follow_link':
        # Try to find the link first
        url = self.find_link(args[0])
        if not url:
            # find_link() gives '' when no page is loaded; fail clearly
            # rather than requesting an empty URL.
            raise TwillException("no links match '%s'" % (args[0],))
        if url.find('://') == -1:
            url = urlparse.urljoin(self.get_url(), url)
    elif func_name == 'reload':
        url = self.get_url()
        if url is None:
            raise TwillException("no page to reload")
    else:
        # Without this branch 'url' would be unbound further down.
        raise TwillException("unknown browser action '%s'" % (func_name,))

    # Credentials are keyed by exact URL; None means "no auth".
    r = self._session.get(url, auth=self._auth.get(url))

    if _follow_equiv_refresh():
        r = self._follow_redirections(r, self._session)

    if func_name in ('follow_link', 'open'):
        # If we're really reloading and just didn't say so, don't store
        if self.result is not None and self.result.get_url() != r.url:
            self._history.append(self.result)

    self.result = ResultWrapper(r)
def submit(self, fieldname=None):
    """
    Submit the currently clicked form using the given field.

    If no form has been selected and the page has exactly one form, that
    form is used.  With no 'fieldname', the last clicked submit button is
    used, falling back to the first submit/image control of the form (or
    to no control at all).  With a 'fieldname', the control is looked up
    with get_form_field().

    The current page is pushed onto the history and the response becomes
    'self.result'.

    Raises TwillException if the page has no forms, or has several forms
    and none was selected.
    """
    if fieldname is not None:
        fieldname = str(fieldname)

    forms = list(self.get_all_forms())
    if len(forms) == 0:
        raise TwillException("no forms on this page!")

    ctl = None

    form = self._form
    if form is None:
        if len(forms) == 1:
            form = forms[0]
        else:
            raise TwillException("""\
more than one form; you must select one (use 'fv') before submitting\
""")

    if form.action is None:
        form.action = self.get_url()

    # A relative (or empty) action has to be resolved against the current
    # page before it is handed to the session; an absolute action is
    # returned unchanged by urljoin.
    action = urlparse.urljoin(self.get_url(), form.action or '')

    # no fieldname?  see if we can use the last submit button clicked...
    if fieldname is None:
        if self.last_submit_button is not None:
            ctl = self.last_submit_button
        else:
            # get first submit button in form.
            submits = [c for c in form.inputs
                       if hasattr(c, 'type')
                       and (c.type == 'submit' or c.type == 'image')]
            if submits:
                ctl = submits[0]
    else:
        # fieldname given; find it.
        ctl = self.get_form_field(form, fieldname)

    if ctl is not None:
        # submit w/button
        print>>OUT, """\
Note: submit is using submit button: name="%s", value="%s"
""" % (ctl.get("name"), ctl.value)

    # @BRT: For now, the referrer is always the current page
    # @CTB this seems like an issue for further work.
    headers = {'referer' : self.get_url()}

    #
    # now actually GO.
    #
    payload = list(form.form_values())
    if ctl is not None and ctl.get("name") is not None:
        payload.append((ctl.get("name"), ctl.value))

    if form.method == 'POST':
        # files=None is the same as not passing files at all.
        r = self._session.post(
            action,
            data=payload,
            files=self._formFiles or None,
            headers=headers
        )
    else:
        # GET forms carry their fields in the query string, not in a
        # request body.
        r = self._session.get(action, params=payload, headers=headers)

    self._formFiles.clear()
    self._history.append(self.result)
    self.result = ResultWrapper(r)
class TwillBrowser(object):
    """
    A simple, stateful browser.

    Pages are fetched through a requests Session (which also stores the
    cookies).  The current page is kept in 'result' as a ResultWrapper;
    previously visited pages are kept in '_history'.
    """

    def __init__(self):
        #
        # create special link/forms parsing code to run tidy on HTML first.
        #

        # WSGI Intercept
        # Taken from
        # https://code.google.com/p/wsgi-intercept/issues/detail?id=23
        # with slight modification
        import wsgi_intercept
        from requests.packages.urllib3 import connectionpool as cpl
        pool_cls = cpl.HTTPConnectionPool
        # Patch only once: a second TwillBrowser() must not overwrite the
        # saved original connection class with the intercepting one.
        if pool_cls.ConnectionCls is not wsgi_intercept.WSGI_HTTPConnection:
            pool_cls.old_http = pool_cls.ConnectionCls
            pool_cls.ConnectionCls = wsgi_intercept.WSGI_HTTPConnection
        wsgi_intercept.wsgi_fake_socket.settimeout = lambda self, timeout: None

        self.result = None
        self.last_submit_button = None

        # Session stores cookies
        self._session = requests.Session()
        self._session.headers.update({"Accept" : "text/html, */*"})

        # An lxml FormElement, none until a form is selected
        # replaces self._browser.form from mechanize
        self._form = None
        self._formFiles = {}

        # A dict of HTTPBasicAuth from requests, keyed off URL
        self._auth = {}

        # callables to be called after each page load.
        self._post_load_hooks = []

        # ResultWrapper objects of the pages visited before the current one.
        self._history = []

    def _set_creds(self, creds):
        """
        Store basic-auth credentials; 'creds' is (url, (user, password)).
        """
        self._auth[creds[0]] = requests.auth.HTTPBasicAuth(*creds[1])

    def _get_creds(self):
        """
        Return the dict of stored credentials, keyed by URL.
        """
        return self._auth

    def go(self, url):
        """
        Visit given URL.

        The URL is tried as given, then with 'http://' prepended if it has
        no scheme, then joined onto the current URL.  Raises TwillException
        if none of the candidates can be opened.
        """
        try_urls = [url, ]

        # if this is an absolute URL that is just missing the 'http://' at
        # the beginning, try fixing that.
        if url.find('://') == -1:
            full_url = 'http://%s' % (url,)  # mimic browser behavior
            try_urls.append(full_url)

        # if this is a '?' or '/' URL, then assume that we want to tack it onto
        # the end of the current URL.
        joined = urlparse.urljoin(self.get_url(), url)
        if joined not in try_urls:
            # don't retry a candidate that is already in the list.
            try_urls.append(joined)

        success = False

        for u in try_urls:
            try:
                self._journey('open', u)
                success = True
                break
            except (IOError, ConnectionError, InvalidSchema):  # @CTB test this!
                pass

        if success:
            print>>OUT, '==> at', self.get_url()
        else:
            raise TwillException("cannot go to '%s'" % (url,))

    def reload(self):
        """
        Tell the browser to reload the current page.
        """
        self._journey('reload')
        print>>OUT, '==> reloaded'

    def back(self):
        """
        Return to previous page, if possible.
        """
        try:
            self._journey('back')
            print>>OUT, '==> back to', self.get_url()
        except TwillException:
            print>>OUT, '==> back at empty page.'

    def get_code(self):
        """
        Get the HTTP status code received for the current page.
        """
        if self.result is not None:
            return self.result.get_http_code()
        return None

    def get_html(self):
        """
        Get the HTML for the current page.
        """
        if self.result is not None:
            return self.result.get_page()
        return None

    def get_title(self):
        """
        Get the title of the current page.

        Raises TwillException if no page has been loaded.
        """
        if self.result is not None:
            return self.result.get_title()
        raise TwillException("Error: Getting title with no page")

    def get_url(self):
        """
        Get the URL of the current page.
        """
        if self.result is not None:
            return self.result.get_url()
        return None

    def find_link(self, pattern):
        """
        Find the first link with a URL, link text, or name matching the
        given pattern.  Returns '' if no page has been loaded.
        """
        if self.result is not None:
            return self.result.find_link(pattern)
        return ''

    def follow_link(self, link):
        """
        Follow the given link.
        """
        self._journey('follow_link', link)
        print>>OUT, '==> at', self.get_url()

    def set_agent_string(self, agent):
        """
        Set the agent string to the given value.
        """
        self._session.headers.update({'User-agent' : agent})
        return

    def showforms(self):
        """
        Pretty-print all of the forms.  Include the global form (form
        elements outside of <form> pairs) as forms[0] iff present.
        """
        forms = self.get_all_forms()
        for n, f in enumerate(forms):
            print_form(n, f, OUT)

    def showlinks(self):
        """
        Pretty-print all of the links.
        """
        links = self.get_all_links()
        for n, link in enumerate(links):
            print>>OUT, "%d. %s ==> %s" % (n, link[0], link[1],)
        print>>OUT, ''

    def showhistory(self):
        """
        Pretty-print the history of links visited.
        """
        print>>OUT, ''
        print>>OUT, 'History: (%d pages total) ' % (len(self._history))
        for n, page in enumerate(self._history):
            print>>OUT, "\t%d. %s" % (n + 1, page.get_url())
        print>>OUT, ''

    def get_all_links(self):
        """
        Return a list of all of the links on the page
        """
        if self.result is not None:
            return self.result.get_links()
        return []

    def get_all_forms(self):
        """
        Return a list of all of the forms, with global_form at index 0
        iff present.
        """
        if self.result is not None:
            return self.result.get_forms()
        return []

    def get_form(self, formname):
        """
        Return the first form that matches 'formname'.
        """
        if self.result is not None:
            return self.result.get_form(formname)
        return None

    def get_form_field(self, form, fieldname):
        """
        Return the control that matches 'fieldname'.  Must be
        a *unique* regexp/exact string match.

        Tried in order: a group of same-named checkboxes, exact id, exact
        name (which wins over an id match), 1-based field number, regexp
        on the name, and finally the control's value.

        Raises TwillException if nothing matches or the match is ambiguous.
        """
        if fieldname in form.fields.keys():
            controls = [f for f in form.inputs if f.get("name") == fieldname \
                        and hasattr(f, 'type') and f.type == 'checkbox']
            if len(controls) > 1:
                return html.CheckboxGroup(controls)

        fieldname = str(fieldname)

        found = None
        found_multiple = False

        matches = [ c for c in form.inputs if c.get("id") == fieldname ]

        # test exact match.
        if matches:
            if unique_match(matches):
                found = matches[0]
            else:
                found_multiple = True   # record for error reporting.

        matches = [ c for c in form.inputs if str(c.name) == fieldname ]

        # test exact match.
        if matches:
            if unique_match(matches):
                found = matches[0]
            else:
                found_multiple = True   # record for error reporting.

        # test index.
        if found is None:
            # try num
            clickies = [c for c in form.inputs]
            try:
                fieldnum = int(fieldname) - 1
                # field numbers are 1-based; "0" or a negative number must
                # not wrap around and pick a control from the end.
                if fieldnum >= 0:
                    found = clickies[fieldnum]
            except ValueError:          # int() failed
                pass
            except IndexError:          # fieldnum was incorrect
                pass

        # test regexp match
        if found is None:
            regexp = re.compile(fieldname)

            matches = [ ctl for ctl in form.inputs \
                        if regexp.search(str(ctl.get("name"))) ]

            if matches:
                if unique_match(matches):
                    found = matches[0]
                else:
                    found_multiple = True # record for error

        if found is None:
            clickies = [ c for c in form.inputs if c.value == fieldname]
            if clickies:
                if len(clickies) == 1:
                    found = clickies[0]
                else:
                    found_multiple = True   # record for error

        # error out?
        if found is None:
            if not found_multiple:
                raise TwillException('no field matches "%s"' % (fieldname,))
            else:
                raise TwillException('multiple matches to "%s"' % (fieldname,))

        return found

    def clicked(self, form, control):
        """
        Record a 'click' in a specific form.
        """
        if self._form != form:
            # switching forms: remember the new one and forget any submit
            # button that was clicked in the old one.
            self._form = form
            self.last_submit_button = None

        # record the last submit button clicked.
        if hasattr(control, 'type') and \
           (control.type == 'submit' or control.type == 'image'):
            self.last_submit_button = control

    def submit(self, fieldname=None):
        """
        Submit the currently clicked form using the given field.

        If no form has been selected and the page has exactly one form,
        that form is used.  With no 'fieldname', the last clicked submit
        button is used, falling back to the first submit/image control of
        the form (or to no control at all).

        Raises TwillException if the page has no forms, or has several
        forms and none was selected.
        """
        if fieldname is not None:
            fieldname = str(fieldname)

        forms = list(self.get_all_forms())
        if len(forms) == 0:
            raise TwillException("no forms on this page!")

        ctl = None

        form = self._form
        if form is None:
            if len(forms) == 1:
                form = forms[0]
            else:
                raise TwillException("""\
more than one form; you must select one (use 'fv') before submitting\
""")

        if form.action is None:
            form.action = self.get_url()

        # A relative (or empty) action has to be resolved against the
        # current page before it is handed to the session; an absolute
        # action is returned unchanged by urljoin.
        action = urlparse.urljoin(self.get_url(), form.action or '')

        # no fieldname?  see if we can use the last submit button clicked...
        if fieldname is None:
            if self.last_submit_button is not None:
                ctl = self.last_submit_button
            else:
                # get first submit button in form.
                submits = [c for c in form.inputs
                           if hasattr(c, 'type')
                           and (c.type == 'submit' or c.type == 'image')]
                if submits:
                    ctl = submits[0]
        else:
            # fieldname given; find it.
            ctl = self.get_form_field(form, fieldname)

        if ctl is not None:
            # submit w/button
            print>>OUT, """\
Note: submit is using submit button: name="%s", value="%s"
""" % (ctl.get("name"), ctl.value)

        # @BRT: For now, the referrer is always the current page
        # @CTB this seems like an issue for further work.
        headers = {'referer' : self.get_url()}

        #
        # now actually GO.
        #
        payload = list(form.form_values())
        if ctl is not None and ctl.get("name") is not None:
            payload.append((ctl.get("name"), ctl.value))

        if form.method == 'POST':
            # files=None is the same as not passing files at all.
            r = self._session.post(
                action,
                data=payload,
                files=self._formFiles or None,
                headers=headers
            )
        else:
            # GET forms carry their fields in the query string, not in a
            # request body.
            r = self._session.get(action, params=payload, headers=headers)

        self._formFiles.clear()
        self._history.append(self.result)
        self.result = ResultWrapper(r)

    def save_cookies(self, filename):
        """
        Save cookies into the given file.
        """
        with open(filename, 'wb') as f:
            pickle.dump(self._session.cookies, f)

    def load_cookies(self, filename):
        """
        Load cookies from the given file.
        """
        # NOTE(review): unpickling executes whatever the file contains;
        # only load cookie files that this program wrote itself.
        with open(filename, 'rb') as f:
            self._session.cookies = pickle.load(f)

    def clear_cookies(self):
        """
        Delete all of the cookies.
        """
        self._session.cookies.clear()

    def show_cookies(self):
        """
        Pretty-print all of the cookies.
        """
        # Count the jar itself so the number always agrees with the list
        # printed below (a name-keyed dict merges same-named cookies).
        jar = self._session.cookies
        print>>OUT, 'There are %d cookie(s) in the cookiejar.\n' % (len(jar))

        if len(jar):
            for cookie in jar:
                print>>OUT, '\t', cookie

            print>>OUT, ''

    # BRT: Added to test for meta redirection
    #   Shamelessly stolen from
    #   http://stackoverflow.com/questions/2318446/how-to-follow-meta-refreshes-in-python
    #   Took some modification to get it working, though
    def _test_for_meta_redirections(self, r):
        """
        Checks a document for meta redirection.

        Returns (True, url) when the page has a meta refresh whose content
        names a target URL, (False, None) otherwise.
        """
        if not r.text.strip():
            # nothing to parse; lxml refuses empty documents.
            return False, None

        html_tree = html.fromstring(r.text)
        attr = html_tree.xpath(
            "//meta[translate(@http-equiv, 'REFSH', 'refsh') = 'refresh']/@content"
        )
        if len(attr) > 0:
            # content looks like "<seconds>; url=<target>".  Split only on
            # the first ';' so a ';' inside the target survives, and
            # tolerate a bare "<seconds>" with no target at all.
            parts = attr[0].split(";", 1)
            if len(parts) == 2:
                text = parts[1].strip().strip('\'"')
                if text.lower().startswith("url="):
                    # the target itself may be quoted: url='...'
                    url = text[4:].strip().strip('\'"')
                    if not url.startswith('http'):
                        # Relative URL, adapt
                        url = urlparse.urljoin(r.url, url)
                    return True, url
        return False, None

    # BRT: Added to test for meta redirection
    #   Shamelessly stolen from the same link as _test_for_meta_redirections
    def _follow_redirections(self, r, s):
        """
        Follow meta refresh redirections, if they exist, starting from
        response 'r' and fetching with session 's'.  Returns the final
        response.

        Raises TwillException if the refreshes lead back to a URL that was
        already visited.
        """
        seen = set([r.url])
        while True:
            redirected, url = self._test_for_meta_redirections(r)
            if not redirected:
                return r
            if url in seen:
                raise TwillException("""\
infinite refresh loop discovered; aborting.
Try turning off acknowledge_equiv_refresh...""")
            seen.add(url)
            r = s.get(url)

    def _journey(self, func_name, *args, **kwargs):
        """
        'func_name' should be one of 'open', 'reload', 'back', or
        'follow_link'.

        journey then runs that function with the given arguments and turns
        the results into a nice friendly standard ResultWrapper object, which
        is stored as 'self.result'.

        Raises TwillException for an unknown 'func_name', when there is no
        page to go back to or to reload, and when no link matches.  Errors
        raised by the session while fetching are not handled here.

        (Idea stolen straight from PBP.)
        """
        self.last_submit_button = None

        if func_name == 'back':
            # 'back' only restores a stored page.
            try:
                self.result = self._history.pop()
            except IndexError:
                raise TwillException("no previous page to go back to")
            return

        if func_name == 'open':
            url = args[0]
        elif func_name == 'follow_link':
            # Try to find the link first
            url = self.find_link(args[0])
            if not url:
                # find_link() gives '' when no page is loaded; fail clearly
                # rather than requesting an empty URL.
                raise TwillException("no links match '%s'" % (args[0],))
            if url.find('://') == -1:
                url = urlparse.urljoin(self.get_url(), url)
        elif func_name == 'reload':
            url = self.get_url()
            if url is None:
                raise TwillException("no page to reload")
        else:
            # Without this branch 'url' would be unbound further down.
            raise TwillException(
                "unknown browser action '%s'" % (func_name,))

        # Credentials are keyed by exact URL; None means "no auth".
        r = self._session.get(url, auth=self._auth.get(url))

        if _follow_equiv_refresh():
            r = self._follow_redirections(r, self._session)

        if func_name in ('follow_link', 'open'):
            # If we're really reloading and just didn't say so, don't store
            if self.result is not None and self.result.get_url() != r.url:
                self._history.append(self.result)

        self.result = ResultWrapper(r)
class TwillBrowser(object): """ Wrap mechanize behavior in a simple stateful way. Public variables: * result -- mechanize-style 'result' object. """ def __init__(self): # # create special link/forms parsing code to run tidy on HTML first. # factory = ConfigurableParsingFactory() # # Create the mechanize browser. # b = PatchedMechanizeBrowser(history=HistoryStack(), factory=factory) self._browser = b self.result = None self.last_submit_button = None # # create & set a cookie jar. # policy = mechanize.DefaultCookiePolicy(rfc2965=True) cj = mechanize.LWPCookieJar(policy=policy) self._browser.set_cookiejar(cj) self.cj = cj # Ask for MIME type 'text/html' by preference. self._browser.addheaders = [("Accept", "text/html; */*")] # ignore robots.txt self._browser.set_handle_robots(None) # create an HTTP auth handler self.creds = mechanize.HTTPPasswordMgr() # do handle HTTP-EQUIV properly. self._browser.set_handle_equiv(True) # callables to be called after each page load. self._post_load_hooks = [] ### get/set HTTP authentication stuff. def _set_creds(self, creds): self._creds = creds self._browser.set_password_manager(creds) def _get_creds(self): return self._creds creds = property(_get_creds, _set_creds) def go(self, url): """ Visit given URL. """ try_urls = [ url, ] # if this is an absolute URL that is just missing the 'http://' at # the beginning, try fixing that. if url.find('://') == -1: full_url = 'http://%s' % (url,) # mimic browser behavior try_urls.append(full_url) # if this is a '?' URL, then assume that we want to tack it onto # the end of the current URL. if url.startswith('?'): current_url = self.get_url() current_url = current_url.split('?')[0] try_urls = [ current_url + url, ] success = False for u in try_urls: try: self._journey('open', u) success = True break except IOError: # @CTB test this! 
pass if success: print>>OUT, '==> at', self.get_url() else: raise BrowserStateError("cannot go to '%s'" % (url,)) def reload(self): """ Tell the browser to reload the current page. """ self._journey('reload') print>>OUT, '==> reloaded' def back(self): """ Return to previous page, if possible. """ try: self._journey('back') print>>OUT, '==> back to', self.get_url() except BrowserStateError: print>>OUT, '==> back at empty page.' def get_code(self): """ Get the HTTP status code received for the current page. """ if self.result: return self.result.get_http_code() return None def get_html(self): """ Get the HTML for the current page. """ if self.result: return self.result.get_page() return None def get_headers(self): """ Get the headers for the current page. """ if self.result: return self.result.get_headers() return None def get_title(self): """ Get content of the HTML title element for the current page. """ return self._browser.title() def get_url(self): """ Get the URL of the current page. """ if self.result: return self.result.get_url() return None def find_link(self, pattern): """ Find the first link with a URL, link text, or name matching the given pattern. """ # # first, try to find a link matching that regexp. # try: l = self._browser.find_link(url_regex=pattern) except LinkNotFoundError: # # then, look for a text match. # try: l = self._browser.find_link(text_regex=pattern) except LinkNotFoundError: # # finally, look for a name match. # try: l = self._browser.find_link(name_regex=pattern) except LinkNotFoundError: l = None return l def follow_link(self, link): """ Follow the given link. """ self._journey('follow_link', link) print>>OUT, '==> at', self.get_url() def set_agent_string(self, agent): """ Set the agent string to the given value. 
""" for i in xrange(len(self._browser.addheaders)): if self._browser.addheaders[i][0] == "User-agent": del self._browser.addheaders[i] break self._browser.addheaders += [("User-agent", agent)] def showforms(self): """ Pretty-print all of the forms. Include the global form (form elements outside of <form> pairs) as forms[0] iff present. """ forms = self.get_all_forms() for n, f in enumerate(forms): print_form(n, f, OUT) def showlinks(self): """ Pretty-print all of the links. """ print>>OUT, 'Links:\n' for n, link in enumerate(self._browser.links()): print>>OUT, "%d. %s ==> %s" % (n, link.text, link.url,) print>>OUT, '' def showhistory(self): """ Pretty-print the history of links visited. """ print>>OUT, '' print>>OUT, 'History: (%d pages total) ' % (len(self._browser._history)) n = 1 for (req, resp) in self._browser._history: if req and resp: # only print those that back() will go print>>OUT, "\t%d. %s" % (n, resp.geturl()) n += 1 print>>OUT, '' def get_all_forms(self): """ Return a list of all of the forms, with global_form at index 0 iff present. """ global_form = self._browser.global_form() forms = list(self._browser.forms()) if global_form.controls: forms.insert(0, global_form) return forms def get_form(self, formname): """ Return the first form that matches 'formname'. """ formname = str(formname) forms = self.get_all_forms() # first try ID for f in forms: id = f.attrs.get("id") if id and str(id) == formname: return f # next try regexps regexp = re.compile(formname) for f in forms: if f.name and regexp.search(f.name): return f # ok, try number try: formnum = int(formname) if formnum >= 1 and formnum <= len(forms): return forms[formnum - 1] except ValueError: # int() failed pass except IndexError: # formnum was incorrect pass return None def get_form_field(self, form, fieldname): """ Return the control that matches 'fieldname'. Must be a *unique* regexp/exact string match. 
""" fieldname = str(fieldname) found = None found_multiple = False matches = [ c for c in form.controls if str(c.id) == fieldname ] # test exact match. if matches: if unique_match(matches): found = matches[0] else: found_multiple = True # record for error reporting. matches = [ c for c in form.controls if str(c.name) == fieldname ] # test exact match. if matches: if unique_match(matches): found = matches[0] else: found_multiple = True # record for error reporting. # test index. if found is None: # try num clickies = [c for c in form.controls] try: fieldnum = int(fieldname) - 1 found = clickies[fieldnum] except ValueError: # int() failed pass except IndexError: # fieldnum was incorrect pass # test regexp match if found is None: regexp = re.compile(fieldname) matches = [ ctl for ctl in form.controls \ if regexp.search(str(ctl.name)) ] if matches: if unique_match(matches): found = matches[0] else: found_multiple = True # record for error if found is None: # try value, for readonly controls like submit keys clickies = [ c for c in form.controls if c.value == fieldname \ and c.readonly ] if clickies: if len(clickies) == 1: found = clickies[0] else: found_multiple = True # record for error # error out? if found is None: if not found_multiple: raise TwillException('no field matches "%s"' % (fieldname,)) else: raise TwillException('multiple matches to "%s"' % (fieldname,)) return found def clicked(self, form, control): """ Record a 'click' in a specific form. """ if self._browser.form != form: # construct a function to choose a particular form; select_form # can use this to pick out a precise form. def choose_this_form(test_form, this_form=form): if test_form is this_form: return True return False self._browser.select_form(predicate=choose_this_form) assert self._browser.form == form self.last_submit_button = None # record the last submit button clicked. 
if isinstance(control, ClientForm.SubmitControl): self.last_submit_button = control def submit(self, fieldname=None): """ Submit the currently clicked form using the given field. """ if fieldname is not None: fieldname = str(fieldname) if not self.get_all_forms(): raise TwillException("no forms on this page!") ctl = None form = self._browser.form if form is None: forms = [ i for i in self.get_all_forms() ] if len(forms) == 1: form = forms[0] else: raise TwillException("""\ more than one form; you must select one (use 'fv') before submitting\ """) # no fieldname? see if we can use the last submit button clicked... if not fieldname: if self.last_submit_button: ctl = self.last_submit_button else: # get first submit button in form. submits = [ c for c in form.controls \ if isinstance(c, ClientForm.SubmitControl) ] if len(submits): ctl = submits[0] else: # fieldname given; find it. ctl = self.get_form_field(form, fieldname) # # now set up the submission by building the request object that # will be sent in the form submission. # if ctl: # submit w/button print>>OUT, """\ Note: submit is using submit button: name="%s", value="%s" """ % (ctl.name, ctl.value) if isinstance(ctl, ClientForm.ImageControl): request = ctl._click(form, (1,1), "", mechanize.Request) else: request = ctl._click(form, True, "", mechanize.Request) else: # submit w/o submit button. request = form._click(None, None, None, None, 0, None, "", mechanize.Request) # # add referer information. this may require upgrading the # request object to have an 'add_unredirected_header' function. # upgrade = self._browser._ua_handlers.get('_http_request_upgrade') if upgrade: request = upgrade.http_request(request) request = self._browser._add_referer_header(request) # # now actually GO. # self._journey('open', request) def save_cookies(self, filename): """ Save cookies into the given file. 
""" self.cj.save(filename, ignore_discard=True, ignore_expires=True) def load_cookies(self, filename): """ Load cookies from the given file. """ self.cj.load(filename, ignore_discard=True, ignore_expires=True) def clear_cookies(self): """ Delete all of the cookies. """ self.cj.clear() def show_cookies(self): """ Pretty-print all of the cookies. """ print>>OUT, ''' There are %d cookie(s) in the cookiejar. ''' % (len(self.cj,)) if len(self.cj): for cookie in self.cj: print>>OUT, '\t', cookie print>>OUT, '' #### private functions. def _journey(self, func_name, *args, **kwargs): """ 'func_name' should be the name of a mechanize method that either returns a 'result' object or raises a HTTPError, e.g. one of 'open', 'reload', 'back', or 'follow_link'. journey then runs that function with the given arguments and turns the results into a nice friendly standard ResultWrapper object, which is stored as 'self.result'. All exceptions other than HTTPError are unhandled. (Idea stolen straight from PBP.) """ # reset self.last_submit_button = None self.result = None func = getattr(self._browser, func_name) try: r = func(*args, **kwargs) except mechanize.HTTPError, e: r = e # seek back to 0 if a seek() function is present. seek_fn = getattr(r, 'seek', None) if seek_fn: seek_fn(0) # some URLs, like 'file:' URLs, don't have return codes. In this # case, assume success (code=200) if no such attribute. code = getattr(r, 'code', 200) ## special case refresh loops!? if code == 'refresh': raise TwillException("""\ infinite refresh loop discovered; aborting. Try turning off acknowledge_equiv_refresh...""") self.result = ResultWrapper(code, r.geturl(), r.read(), r.info()) # # Now call all of the post load hooks with the function name. # for callable in self._post_load_hooks: callable(func_name, *args, **kwargs)
class TwillBrowser(object): """A simple, stateful browser""" def __init__(self): # # create special link/forms parsing code to run tidy on HTML first. # # WSGI Intercept # Taken from # https://code.google.com/p/wsgi-intercept/issues/detail?id=23 # with slight modification import wsgi_intercept from requests.packages.urllib3 import connectionpool as cpl cpl.HTTPConnectionPool.old_http = cpl.HTTPConnectionPool.ConnectionCls cpl.HTTPConnectionPool.ConnectionCls = wsgi_intercept.WSGI_HTTPConnection wsgi_intercept.wsgi_fake_socket.settimeout = lambda self, timeout: None self.result = None self.last_submit_button = None # Session stores cookies self._session = requests.Session() self._session.headers.update({"Accept" : "text/html; */*"}) # An lxml FormElement, none until a form is selected # replaces self._browser.form from mechanize self._form = None self._formFiles = {} # A dict of HTTPBasicAuth from requests, keyed off URL self._auth = {} # callables to be called after each page load. self._post_load_hooks = [] self._history = [] def _set_creds(self, creds): self._auth[creds[0]] = requests.auth.HTTPBasicAuth(*creds[1]) def _get_creds(self): return self._auth def go(self, url): """ Visit given URL. """ try_urls = [url, ] # if this is an absolute URL that is just missing the 'http://' at # the beginning, try fixing that. if url.find('://') == -1: full_url = 'http://%s' % (url,) # mimic browser behavior try_urls.append(full_url) # if this is a '?' or '/' URL, then assume that we want to tack it onto # the end of the current URL. try_urls.append(urlparse.urljoin(self.get_url(), url)) success = False for u in try_urls: try: self._journey('open', u) success = True break except (IOError, ConnectionError, InvalidSchema): # @CTB test this! pass if success: print>>OUT, '==> at', self.get_url() else: raise TwillException("cannot go to '%s'" % (url,)) def reload(self): """ Tell the browser to reload the current page. 
""" self._journey('reload') print>>OUT, '==> reloaded' def back(self): """ Return to previous page, if possible. """ try: self._journey('back') print>>OUT, '==> back to', self.get_url() except TwillException: print>>OUT, '==> back at empty page.' def get_code(self): """ Get the HTTP status code received for the current page. """ if self.result is not None: return self.result.get_http_code() return None def get_html(self): """ Get the HTML for the current page. """ if self.result is not None: return self.result.get_page() return None def get_title(self): if self.result is not None: return self.result.get_title() raise TwillException("Error: Getting title with no page") def get_url(self): """ Get the URL of the current page. """ if self.result is not None: return self.result.get_url() return None def find_link(self, pattern): """ Find the first link with a URL, link text, or name matching the given pattern. """ if self.result is not None: return self.result.find_link(pattern) return '' def follow_link(self, link): """ Follow the given link. """ self._journey('follow_link', link) print>>OUT, '==> at', self.get_url() def set_agent_string(self, agent): """ Set the agent string to the given value. """ self._session.headers.update({'User-agent' : agent}) return def showforms(self): """ Pretty-print all of the forms. Include the global form (form elements outside of <form> pairs) as forms[0] iff present. """ forms = self.get_all_forms() for n, f in enumerate(forms): print_form(n, f, OUT) def showlinks(self): """ Pretty-print all of the links. """ links = self.get_all_links() for n,link in enumerate(links): print>>OUT, "%d. %s ==> %s" % (n, link[0], link[1],) print>>OUT, '' def showhistory(self): """ Pretty-print the history of links visited. """ print>>OUT, '' print>>OUT, 'History: (%d pages total) ' % (len(self._history)) n = 1 for page in self._history: print>>OUT, "\t%d. 
%s" % (n, page.get_url()) n += 1 print>>OUT, '' def get_all_links(self): """ Return a list of all of the links on the page """ if self.result is not None: return self.result.get_links() return [] def get_all_forms(self): """ Return a list of all of the forms, with global_form at index 0 iff present. """ if self.result is not None: return self.result.get_forms() return [] def get_form(self, formname): """ Return the first form that matches 'formname'. """ if self.result is not None: return self.result.get_form(formname) return None def get_form_field(self, form, fieldname): """ Return the control that matches 'fieldname'. Must be a *unique* regexp/exact string match. """ if fieldname in form.fields.keys(): controls = [f for f in form.inputs if f.get("name") == fieldname \ and hasattr(f, 'type') and f.type == 'checkbox'] if len(controls) > 1: return html.CheckboxGroup(controls) fieldname = str(fieldname) found = None found_multiple = False matches = [ c for c in form.inputs if c.get("id") == fieldname ] # test exact match. if matches: if unique_match(matches): found = matches[0] else: found_multiple = True # record for error reporting. matches = [ c for c in form.inputs if str(c.name) == fieldname ] # test exact match. if matches: if unique_match(matches): found = matches[0] else: found_multiple = True # record for error reporting. # test index. 
if found is None: # try num clickies = [c for c in form.inputs] try: fieldnum = int(fieldname) - 1 found = clickies[fieldnum] except ValueError: # int() failed pass except IndexError: # fieldnum was incorrect pass # test regexp match if found is None: regexp = re.compile(fieldname) matches = [ ctl for ctl in form.inputs \ if regexp.search(str(ctl.get("name"))) ] if matches: if unique_match(matches): found = matches[0] else: found_multiple = True # record for error if found is None: clickies = [ c for c in form.inputs if c.value == fieldname] if clickies: if len(clickies) == 1: found = clickies[0] else: found_multiple = True # record for error # error out? if found is None: if not found_multiple: raise TwillException('no field matches "%s"' % (fieldname,)) else: raise TwillException('multiple matches to "%s"' % (fieldname,)) return found def clicked(self, form, control): """ Record a 'click' in a specific form. """ if self._form != form: # construct a function to choose a particular form; select_form # can use this to pick out a precise form. self._form = form self.last_submit_button = None # record the last submit button clicked. if hasattr(control, 'type') and \ (control.type == 'submit' or control.type == 'image'): self.last_submit_button = control def submit(self, fieldname=None): """ Submit the currently clicked form using the given field. """ if fieldname is not None: fieldname = str(fieldname) if len(self.get_all_forms()) == 0: raise TwillException("no forms on this page!") ctl = None form = self._form if form is None: forms = [ i for i in self.get_all_forms() ] if len(forms) == 1: form = forms[0] else: raise TwillException("""\ more than one form; you must select one (use 'fv') before submitting\ """) if form.action is None: form.action = self.get_url() # no fieldname? see if we can use the last submit button clicked... if fieldname is None: if self.last_submit_button is not None: ctl = self.last_submit_button else: # get first submit button in form. 
submits = [ c for c in form.inputs if hasattr(c, 'type') and (c.type == 'submit' or c.type == 'image')] if len(submits) != 0: ctl = submits[0] else: # fieldname given; find it. ctl = self.get_form_field(form, fieldname) # # now set up the submission by building the request object that # will be sent in the form submission. # if ctl is not None: # submit w/button print>>OUT, """\ Note: submit is using submit button: name="%s", value="%s" """ % (ctl.get("name"), ctl.value) if hasattr(ctl, 'type') and ctl.type == 'image': pass else: # submit w/o submit button. pass # @BRT: For now, the referrer is always the current page # @CTB this seems like an issue for further work. headers = {'referer' : self.get_url()} # # add referer information. this may require upgrading the # request object to have an 'add_unredirected_header' function. # # # now actually GO. # payload = list(form.form_values()) if ctl is not None and ctl.get("name") is not None: payload.append( (ctl.get("name"), ctl.value) ) if form.method == 'POST': if len(self._formFiles) != 0: r = self._session.post( form.action, data=payload, files=self._formFiles, headers=headers ) else: r = self._session.post( form.action, data=payload, headers=headers ) else: r = self._session.get(form.action, data=payload, headers=headers) self._formFiles.clear() self._history.append(self.result) self.result = ResultWrapper(r) def save_cookies(self, filename): """ Save cookies into the given file. """ with open(filename, 'wb') as f: pickle.dump(self._session.cookies, f) def load_cookies(self, filename): """ Load cookies from the given file. """ with open(filename, 'rb') as f: self._session.cookies = pickle.load(f) def clear_cookies(self): """ Delete all of the cookies. """ self._session.cookies.clear() def show_cookies(self): """ Pretty-print all of the cookies. 
""" c = requests.utils.dict_from_cookiejar(self._session.cookies) print>>OUT, 'There are %d cookie(s) in the cookiejar.\n' % (len(c)) if len(self._session.cookies): for cookie in self._session.cookies: print>>OUT, '\t', cookie print>>OUT, '' # BRT: Added to test for meta redirection # Shamelessly stolen from # http://stackoverflow.com/questions/2318446/how-to-follow-meta-refreshes-in-python # Took some modification to get it working, though # Original post notes that this doesn't check circular redirect # Is this something we're concerned with? def _test_for_meta_redirections(self, r): """ Checks a document for meta redirection """ html_tree = html.fromstring(r.text) attr = html_tree.xpath( "//meta[translate(@http-equiv, 'REFSH', 'refsh') = 'refresh']/@content" ) if len(attr) > 0: wait, text = attr[0].split(";") # @BRT: Strip surrounding quotes and ws; less brute force method? # Other chars that need to be dealt with? text = text.strip() text = text.strip('\'"') if text.lower().startswith("url="): url = text[4:] if not url.startswith('http'): # Relative URL, adapt url = urlparse.urljoin(r.url, url) return True, url return False, None # BRT: Added to test for meta redirection # Shamelessly stolen from the same link as _test_for_meta_redirections def _follow_redirections(self, r, s): """ Recursive function that follows meta refresh redirections if they exist. """ redirected, url = self._test_for_meta_redirections(r) if redirected: r = self._follow_redirections(s.get(url), s) return r def _journey(self, func_name, *args, **kwargs): """ 'func_name' should be one of 'open', 'reload', 'back', or 'follow_link'. journey then runs that function with the given arguments and turns the results into a nice friendly standard ResultWrapper object, which is stored as 'self.result'. All exceptions other than HTTPError are unhandled. (Idea stolen straight from PBP.) 
""" self.last_submit_button = None if func_name == 'open': url = args[0] elif func_name == 'follow_link': # Try to find the link first url = self.find_link(args[0]) if url.find('://') == -1: url = urlparse.urljoin(self.get_url(), url) elif func_name == 'reload': url = self.get_url() elif func_name == 'back': try: self.result = self._history.pop() return except IndexError: raise TwillException if url in self._auth.keys(): auth = self._auth[url] else: auth = None r = self._session.get(url, auth = auth) if _follow_equiv_refresh(): r = self._follow_redirections(r, self._session) if func_name in ['follow_link', 'open']: # If we're really reloading and just didn't say so, don't store if self.result is not None and self.result.get_url() != r.url: self._history.append(self.result) self.result = ResultWrapper(r)