def _pre_parse(self, http_resp):
    """
    Run the parent's pre-parse step and verify that a base URL is available.

    :param http_resp: The HTTP response document that contains the HTML
                      document inside its body.
    :raises AssertionError: If ``self._base_url`` is falsy once
                            ``SGMLParser._pre_parse`` has returned.
    """
    SGMLParser._pre_parse(self, http_resp)

    # Explicit check instead of an ``assert`` statement: assert statements
    # are removed when Python runs with -O, which would silently drop this
    # precondition. The exception type and message are unchanged.
    if not self._base_url:
        raise AssertionError('The base URL must be set.')
def _pre_parse(self, HTTPResponse):
    """
    Run the parent's pre-parse step and verify that a base URL is available.

    :param HTTPResponse: The HTTP response document that contains the WML
                         document inside its body.
    :raises AssertionError: If ``self._base_url`` is ``None`` once
                            ``SGMLParser._pre_parse`` has returned.
    """
    SGMLParser._pre_parse(self, HTTPResponse)

    # Explicit check instead of an ``assert`` statement: assert statements
    # are removed when Python runs with -O, which would silently drop this
    # precondition. The exception type and message are unchanged.
    if self._base_url is None:
        raise AssertionError('The base URL must be set.')
def __init__(self, http_resp):
    """
    Initialise the parent parser while ``self._parse`` is temporarily
    replaced by a do-nothing callable, so any call the parent constructor
    makes to it has no effect. The real ``_parse`` is put back afterwards.

    :param http_resp: Forwarded unchanged to ``SGMLParser.__init__``.
    """
    # Keep a handle on the real implementation
    real_parse = self._parse

    def _skip_parse(arg):
        # Stand-in used only while the parent constructor runs
        return None

    self._parse = _skip_parse

    SGMLParser.__init__(self, http_resp)

    # Put the real implementation back in place
    self._parse = real_parse
def _handle_textarea_tag_end(self, tag):
    """
    Handler for the closing textarea tag.

    After delegating to the parent handler, the collected textarea name and
    data are packed into a ``name``/``value`` dict. When at least one form is
    known, the dict items are added as an input of the most recent form;
    otherwise the dict is kept in ``self._saved_inputs``.

    :param tag: Forwarded unchanged to the parent handler.
    """
    SGMLParser._handle_textarea_tag_end(self, tag)

    textarea_attrs = {
        'name': self._textarea_tag_name,
        'value': self._textarea_data,
    }

    if self._forms:
        # Attach to the last form that was opened
        self._forms[-1].add_input(textarea_attrs.items())
        return

    # No form seen so far: park the attributes for later
    self._saved_inputs.append(textarea_attrs)
def __init__(self, http_resp):
    """
    Set up the per-instance parsing state and then run the parent
    constructor.

    :param http_resp: Response object; its ``get_url()`` result is stored in
                      ``self._source_url`` and the object itself is passed on
                      to ``SGMLParser.__init__``.
    """
    # URL of the response, saved for use in form parsing
    self._source_url = http_resp.get_url()

    # Input tags found outside of the scope of a form tag end up here
    self._saved_inputs = []

    # State for <select> elements parsing
    self._selects = []

    # State for <textarea> elements parsing
    self._textarea_tag_name = self._textarea_data = ""

    # The parent constructor runs last, once all attributes above exist
    SGMLParser.__init__(self, http_resp)
def _handle_select_tag_end(self, tag):
    """
    Handler for the closing select tag.

    After delegating to the parent handler, every ``(name, options)`` pair
    stored in ``self._selects`` is added to the most recent form, with each
    option converted to a tuple of its items. Nothing else happens when no
    form is known.

    :param tag: Forwarded unchanged to the parent handler.
    """
    SGMLParser._handle_select_tag_end(self, tag)

    if not self._forms:
        return

    current_form = self._forms[-1]
    for select_name, options in self._selects:
        # add_select receives a list of tuples, one tuple per option
        option_tuples = [tuple(option.items()) for option in options]
        current_form.add_select(select_name, option_tuples)

    # Reset selects container
    # NOTE(review): the reset is kept on the "forms present" path only --
    # confirm against the original indentation.
    self._selects = []
def _handle_form_tag_start(self, tag, attrs):
    """
    Handle an opening form tag.

    A form object is built from the tag's ``method`` and ``action``
    attributes and appended to ``self._forms``. Any "pending inputs" stored
    in ``self._saved_inputs`` are then handled as if they had appeared after
    the form tag, and the pending list is emptied.

    When the action is absent, or joining it with the base URL raises
    ``ValueError``, ``self._source_url`` is used as the action.

    :param tag: Forwarded unchanged to the parent handler.
    :param attrs: Dict-like object holding the tag attributes.
    """
    SGMLParser._handle_form_tag_start(self, tag, attrs)

    # Upper-cased method, GET when the attribute is not there
    method = attrs.get('method', 'GET').upper()

    action = attrs.get('action', None)
    action_is_usable = action is not None

    if action_is_usable:
        action = self._decode_url(action)
        try:
            action = self._base_url.url_join(action,
                                             encoding=self._encoding)
        except ValueError:
            action_is_usable = False

    if not action_is_usable:
        om.out.debug(
            'HTMLParser found a form without an action attribute. '
            'Javascript may be used... but another option (mozilla does '
            'this) is that the form is expected to be posted back to the'
            ' same URL (the one that returned the HTML that we are parsing).'
        )
        action = self._source_url

    # Build the form and remember it for the tags that follow
    new_form = form.Form(encoding=self._encoding)
    new_form.set_method(method)
    new_form.set_action(action)
    self._forms.append(new_form)

    # Inputs seen before any form tag: only dict entries are handled
    for pending in self._saved_inputs:
        if not isinstance(pending, dict):
            continue
        self._handle_input_tag_inside_form('input', pending)

    # Everything pending was visited, start over with an empty list
    self._saved_inputs = []
def _handle_form_tag_start(self, tag, attrs):
    """
    Handle an opening form tag and flush the "pending inputs".

    The parent handler runs first. A form object is then created with the
    tag's upper-cased ``method`` (``GET`` when absent) and its ``action``
    joined with the base URL, and is appended to ``self._forms``. If there is
    no action, or the join raises ``ValueError``, a debug message is written
    and ``self._source_url`` becomes the action. Finally each dict stored in
    ``self._saved_inputs`` is handled as an input inside the form and the
    list is replaced by an empty one.

    :param tag: Forwarded unchanged to the parent handler.
    :param attrs: Dict-like object holding the tag attributes.
    """
    SGMLParser._handle_form_tag_start(self, tag, attrs)

    form_method = attrs.get('method', 'GET').upper()

    form_action = attrs.get('action', None)
    fall_back_to_source = form_action is None

    if not fall_back_to_source:
        form_action = self._decode_url(form_action)
        try:
            form_action = self._base_url.url_join(form_action,
                                                  encoding=self._encoding)
        except ValueError:
            # The action could not be joined with the base URL
            fall_back_to_source = True

    if fall_back_to_source:
        debug_msg = ('HTMLParser found a form without an action attribute. '
                     'Javascript may be used... but another option (mozilla does '
                     'this) is that the form is expected to be posted back to the'
                     ' same URL (the one that returned the HTML that we are parsing).')
        om.out.debug(debug_msg)
        form_action = self._source_url

    # Store the new form for later use
    parsed_form = form.Form(encoding=self._encoding)
    parsed_form.set_method(form_method)
    parsed_form.set_action(form_action)
    self._forms.append(parsed_form)

    # Input tags that were found outside the scope of a form tag are
    # treated just as if they came after this form tag opening
    for saved in self._saved_inputs:
        if not isinstance(saved, dict):
            continue
        self._handle_input_tag_inside_form('input', saved)

    # Nothing is pending any more
    self._saved_inputs = []
def __init__(self, HTTPResponse):
    """
    Create the ``_select_tag_name`` attribute and run the parent constructor.

    :param HTTPResponse: Forwarded unchanged to ``SGMLParser.__init__``.
    """
    # Starts out as an empty string
    self._select_tag_name = ''

    # Call parent's __init__
    SGMLParser.__init__(self, HTTPResponse)