def _get_form(response, formname, formnumber, formxpath):
    """Find the form element in ``response``.

    Selectors are tried in this order: ``formname``, then ``formxpath``,
    then ``formnumber``.

    :param response: response whose body is parsed with the lxml HTML parser
    :param formname: value of the form's ``name`` attribute, or None
    :param formnumber: index into the document's forms, or None
    :param formxpath: XPath expression, or None; the first selected node is
        used if it is a ``<form>``, otherwise its closest ``<form>`` ancestor
    :returns: the selected form element, or None when no selector picked a
        form and ``formnumber`` is None
    :raises ValueError: if the document has no ``<form>`` at all, or if
        ``formxpath`` is given and does not lead to a ``<form>``
    :raises IndexError: if ``formnumber`` is out of range
    """
    from scrapy.selector.lxmldocument import LxmlDocument
    root = LxmlDocument(response, lxml.html.HTMLParser)
    forms = root.forms
    if not forms:
        raise ValueError("No <form> element found in %s" % response)

    if formname is not None:
        # Bind the name as an XPath variable instead of pasting it into the
        # expression: a name containing a double quote would otherwise
        # produce a malformed (or altered) query. '%s' keeps the same string
        # coercion the interpolated expression applied.
        named = root.xpath('//form[@name=$formname]',
                           formname='%s' % formname)
        if named:
            return named[0]

    # Get form element from xpath, if not found, go up
    if formxpath is not None:
        nodes = root.xpath(formxpath)
        if nodes:
            el = nodes[0]
            while el is not None:
                if el.tag == 'form':
                    return el
                el = el.getparent()
        raise ValueError('No <form> element found with %s' % formxpath)

    # If we get here, it means that either formname was None
    # or invalid
    if formnumber is not None:
        try:
            return forms[formnumber]
        except IndexError:
            raise IndexError("Form number %d not found in %s" %
                             (formnumber, response))

    # No selector matched and no index was requested.
    return None
def test_caching(self):
    """LxmlDocument returns the same object for the same response.

    Two calls with one response must yield the identical document, while a
    copy of that response must yield a different one.
    """
    original = HtmlResponse(
        'http://www.example.com',
        body='<html><head></head><body></body></html>',
    )
    duplicate = original.copy()

    first = LxmlDocument(original)
    second = LxmlDocument(original)
    from_duplicate = LxmlDocument(duplicate)

    # Same response object -> cached document is reused.
    assert first is second
    # A copied response is a different object -> separate document.
    assert first is not from_duplicate
def test_caching(self):
    """LxmlDocument returns the same object for the same response.

    Two calls with one response must yield the identical document, while a
    copy of that response must yield a different one.
    """
    original = HtmlResponse(
        'http://www.example.com',
        body='<html><head></head><body></body></html>',
    )
    duplicate = original.copy()

    first = LxmlDocument(original)
    second = LxmlDocument(original)
    from_duplicate = LxmlDocument(duplicate)

    # Same response object -> cached document is reused.
    assert first is second
    # A copied response is a different object -> separate document.
    assert first is not from_duplicate

    # Drop the references explicitly so the documents do not stay in memory
    # and trigger wrong libxml2 leak reports.
    del first, second, from_duplicate
def test_null_char(self):
    """Building a document from a body with a NUL character must not raise."""
    # Bodies containing '\x00' must not trigger a TypeError exception; the
    # test passes as long as the LxmlDocument call below completes.
    payload = 'test problematic \x00 body'
    content_headers = {'Content-Type': 'text/plain; charset=utf-8'}
    response = TextResponse(
        'http://example.com/catalog/product/blabla-123',
        headers=content_headers,
        body=payload,
    )
    LxmlDocument(response)
def generate_form_requests_from_response(self, response, **kwargs):
    """Yield a FormRequest for every (form, search term) combination.

    Each form found in ``response`` is filled once per entry of
    ``self.search_terms`` via ``self._fill_form`` and turned into a request
    with ``FormRequest.from_response``.

    :param response: response whose body is parsed for ``<form>`` elements
    :param kwargs: extra keyword arguments forwarded to
        ``FormRequest.from_response``; if they already select a form
        (``formname``, ``formid``, ``formnumber``, ``formxpath`` or
        ``formcss``) that selection is left untouched
    :returns: generator of the requests built by
        ``FormRequest.from_response``
    """
    doc = LxmlDocument(response, HTMLParser)

    # Without an explicit selector, from_response falls back to its default
    # form for every request, no matter which form the data was built from.
    # Point it at the form being iterated unless the caller chose one.
    form_selectors = ('formname', 'formid', 'formnumber', 'formxpath',
                      'formcss')
    caller_selected_form = any(key in kwargs for key in form_selectors)

    for form_index, form in enumerate(doc.forms):
        for search_term in self.search_terms:
            # find input boxes and fill them out
            form_method, formdata = self._fill_form(form, search_term)
            request_kwargs = dict(kwargs)
            if not caller_selected_form:
                request_kwargs['formnumber'] = form_index
            yield FormRequest.from_response(response,
                                            formdata=formdata,
                                            method=form_method,
                                            **request_kwargs)
def _get_form(response, formname, formnumber):
    """Find the form element in ``response``.

    ``formname`` is tried first; if it is None or matches nothing, the form
    at index ``formnumber`` is used.

    :param response: response whose body is parsed with the lxml HTML parser
    :param formname: value of the form's ``name`` attribute, or None
    :param formnumber: index into the document's forms, or None
    :returns: the selected form element, or None when ``formname`` did not
        match and ``formnumber`` is None
    :raises ValueError: if the document has no ``<form>`` at all
    :raises IndexError: if ``formnumber`` is out of range
    """
    from scrapy.selector.lxmldocument import LxmlDocument
    root = LxmlDocument(response, lxml.html.HTMLParser)
    forms = root.forms
    if not forms:
        raise ValueError("No <form> element found in %s" % response)

    if formname is not None:
        # Bind the name as an XPath variable instead of pasting it into the
        # expression: a name containing a double quote would otherwise
        # produce a malformed (or altered) query. '%s' keeps the same string
        # coercion the interpolated expression applied.
        named = root.xpath('//form[@name=$formname]',
                           formname='%s' % formname)
        if named:
            return named[0]

    # If we get here, it means that either formname was None
    # or invalid
    if formnumber is not None:
        try:
            return forms[formnumber]
        except IndexError:
            raise IndexError("Form number %d not found in %s" %
                             (formnumber, response))

    # No selector matched and no index was requested.
    return None
def _get_form(response, formname, formnumber, formxpath):
    """Find the form element in ``response``.

    Selectors are tried in this order: ``formname``, then ``formxpath``,
    then ``formnumber``.

    :param response: response whose body is parsed with the lxml HTML parser
    :param formname: value of the form's ``name`` attribute, or None
    :param formnumber: index into the list of ``//form`` matches, or None
    :param formxpath: XPath expression, or None; the first selected node is
        used if it is a ``<form>``, otherwise its closest ``<form>`` ancestor
    :returns: the selected form element, or None when no selector picked a
        form and ``formnumber`` is None
    :raises ValueError: if the document has no ``<form>`` at all, or if
        ``formxpath`` is given and does not lead to a ``<form>``
    :raises IndexError: if ``formnumber`` is out of range
    """
    from scrapy.selector.lxmldocument import LxmlDocument
    root = LxmlDocument(response, lxml.html.HTMLParser)
    forms = root.xpath('//form')
    if not forms:
        raise ValueError("No <form> element found in %s" % response)

    if formname is not None:
        # Bind the name as an XPath variable instead of pasting it into the
        # expression: a name containing a double quote would otherwise
        # produce a malformed (or altered) query. '%s' keeps the same string
        # coercion the interpolated expression applied.
        named = root.xpath('//form[@name=$formname]',
                           formname='%s' % formname)
        if named:
            return named[0]

    # Get form element from xpath, if not found, go up
    if formxpath is not None:
        nodes = root.xpath(formxpath)
        if nodes:
            el = nodes[0]
            while el is not None:
                if el.tag == 'form':
                    return el
                el = el.getparent()
        raise ValueError('No <form> element found with %s' % formxpath)

    # If we get here, it means that either formname was None
    # or invalid
    if formnumber is not None:
        try:
            return forms[formnumber]
        except IndexError:
            raise IndexError("Form number %d not found in %s" %
                             (formnumber, response))

    # No selector matched and no index was requested.
    return None