def _pre_parse(self, http_resp):
    '''
    Run the generic SGMLParser pre-parse step, then check that a base URL
    is available on this parser.

    @parameter http_resp: The HTTP response document whose body holds the
    HTML document. It is forwarded unchanged to SGMLParser._pre_parse.

    @raise AssertionError: When self._baseUrl is falsy once the parent
    pre-parse step has finished.
    '''
    SGMLParser._pre_parse(self, http_resp)

    # Sanity check on the state left behind by the parent step.
    assert self._baseUrl, 'The base URL must be set.'
def _handle_textarea_tag_end(self, tag):
    """
    Handler for the textarea end tag.

    After delegating to SGMLParser._handle_textarea_tag_end, the textarea
    name and data collected on this parser are packed into an attribute
    dict. That dict is added as an input to the last form in self._forms,
    or queued in self._saved_inputs when self._forms is empty.

    @parameter tag: Forwarded unchanged to the parent handler.
    """
    SGMLParser._handle_textarea_tag_end(self, tag)

    textarea_attrs = {
        'name': self._textarea_tag_name,
        'value': self._textarea_data,
    }

    if self._forms:
        # Attach to the last form in the list.
        self._forms[-1].addInput(textarea_attrs.items())
    else:
        # No form yet: keep the attributes in the pending-inputs list.
        self._saved_inputs.append(textarea_attrs)
def _pre_parse(self, httpResponse):
    '''
    Run the generic SGMLParser pre-parse step, then check that a base URL
    has been set on this parser.

    @parameter httpResponse: The HTTP response document that contains the
    WML document inside its body.

    @raise AssertionError: When self._baseUrl is None once the parent
    pre-parse step has finished.

    Init,
    >>> from core.data.url.httpResponse import httpResponse as httpResponse
    >>> u = url_object('http://www.w3af.com/')

    Parse a simple form,
    >>> form = """
    ...    <go method="post" href="dataReceptor.php">
    ...        <postfield name="clave" value="$(clave)"/>
    ...        <postfield name="cuenta" value="$(cuenta)"/>
    ...        <postfield name="tipdat" value="D"/>
    ...    </go>"""
    >>> response = httpResponse( 200, form, {}, u, u )
    >>> w = wmlParser(response)
    >>> w.getForms()
    [Form({'clave': ['$(clave)'], 'cuenta': ['$(cuenta)'], 'tipdat': ['D']})]

    Get the simplest link
    >>> response = httpResponse( 200, '<a href="/index.aspx">ASP.NET</a>', {}, u, u )
    >>> w = wmlParser( response )
    >>> re, parsed = w.getReferences()

    #
    #    TODO:
    #        I don't really understand why I'm getting results @ the "re".
    #        They should really be inside the "parsed" list.
    #
    #    >>> re
    #    []
    #    >>> parsed[0].url_string
    #    u'http://www.w3af.com/index.aspx'

    Get a link by applying regular expressions
    >>> response = httpResponse(200, 'header /index.aspx footer', {}, u, u)
    >>> w = wmlParser( response )
    >>> re, parsed = w.getReferences()
    >>> #
    >>> # TODO: Shouldn't this be the other way around?!
    >>> #
    >>> re
    []
    >>> parsed[0].url_string
    u'http://www.w3af.com/index.aspx'
    '''
    SGMLParser._pre_parse(self, httpResponse)

    # Sanity check on the state left behind by the parent step.
    assert self._baseUrl is not None, 'The base URL must be set.'
def __init__(self, http_resp):
    """
    Set up the parser's own bookkeeping attributes and then initialise the
    SGMLParser base with the same response.

    @parameter http_resp: The HTTP response to parse. Its getURL() value is
    stored on the instance before the response is handed to
    SGMLParser.__init__.
    """
    # URL of the response, kept around for use during form parsing.
    self._source_url = http_resp.getURL()

    # Container for the <select> elements collected while parsing.
    self._selects = []

    # Name and data of the <textarea> element being parsed.
    self._textarea_tag_name = self._textarea_data = ""

    # Input tags found outside the scope of any form tag are parked here.
    self._saved_inputs = []

    # The parent constructor runs last.
    # NOTE(review): presumably it triggers the parsing, so the attributes
    # above must already exist when it runs - confirm.
    SGMLParser.__init__(self, http_resp)
def _handle_select_tag_end(self, tag):
    """
    Handler for select end tag.

    Delegates to SGMLParser._handle_select_tag_end and then, when at least
    one form is present in self._forms, adds every entry collected in
    self._selects to the last form.

    @parameter tag: Forwarded unchanged to the parent handler.
    """
    SGMLParser._handle_select_tag_end(self, tag)
    if self._forms:
        # The selects are attached to the last form in the list.
        form_obj = self._forms[-1]
        for sel_name, optvalues in self._selects:
            # First convert to list of tuples before passing it as arg
            optvalues = [tuple(attrs.items()) for attrs in optvalues]
            form_obj.addSelect(sel_name, optvalues)
        # Reset selects container
        # NOTE(review): the reset is placed inside the `if`, so selects
        # collected while no form exists are kept - confirm this matches
        # the intended layout.
        self._selects = []
def _handle_form_tag_start(self, tag, attrs):
    '''
    Handle the opening of a form tag.

    A new form object is created with the method and action taken from the
    tag attributes and appended to self._forms. Afterwards any inputs queued
    in self._saved_inputs are processed as if they had been found after the
    form tag, and the queue is emptied.

    @parameter tag: Forwarded unchanged to the parent handler.
    @parameter attrs: Mapping with the attributes of the form tag; the
    'method' and 'action' keys are read from it.
    '''
    SGMLParser._handle_form_tag_start(self, tag, attrs)

    # HTTP method, upper-cased; GET when the attribute is absent.
    http_method = attrs.get('method', 'GET').upper()

    # Resolve the action against the base URL when one was given.
    target = attrs.get('action', None)
    usable_action = target is not None
    if usable_action:
        target = self._decode_url(target)
        try:
            target = self._baseUrl.urlJoin(target)
        except ValueError:
            # The join failed: treat it like a missing action.
            usable_action = False

    if not usable_action:
        om.out.debug(
            'HTMLParser found a form without an action attribute. '
            'Javascript may be used... but another option (mozilla does '
            'this) is that the form is expected to be posted back to the'
            ' same URL (the one that returned the HTML that we are parsing).'
        )
        # Fall back to the URL stored in self._source_url.
        target = self._source_url

    # Build the form object and keep it for later use.
    new_form = form.Form(encoding=self._encoding)
    new_form.setMethod(http_method)
    new_form.setAction(target)
    self._forms.append(new_form)

    # Inputs seen before any form tag are handled now, exactly as if they
    # had appeared after this form's opening tag.
    for pending in self._saved_inputs:
        if not isinstance(pending, dict):
            continue
        self._handle_input_tag_inside_form('input', pending)

    # Everything was processed; start over with an empty queue.
    self._saved_inputs = []
def __init__(self, httpResponse):
    """
    Initialise the select-tag name to an empty string and then initialise
    the SGMLParser base.

    @parameter httpResponse: Forwarded unchanged to SGMLParser.__init__.
    """
    # Set before the parent constructor runs.
    self._select_tag_name = ""

    SGMLParser.__init__(self, httpResponse)