Beispiel #1
0
def _get_form(response, formname, formnumber, formxpath):
    """Find the form element of ``response`` selected by the arguments.

    Selectors are tried in this order: ``formname``, ``formxpath``,
    ``formnumber``. A ``formname`` that matches nothing is not an error;
    the lookup falls through to the next selector.

    :param response: response handed to ``LxmlDocument`` together with
        ``lxml.html.HTMLParser`` to obtain the document root.
    :param formname: value of the form's ``name`` attribute, or ``None``
        to skip the lookup by name.
    :param formnumber: index into the document's forms, or ``None`` to
        skip the lookup by position.
    :param formxpath: XPath expression; its first result, or the nearest
        ``form`` ancestor of that result, is returned. ``None`` to skip.
    :returns: the selected form element, or ``None`` when no selector
        matched and ``formnumber`` is ``None``.
    :raises ValueError: if the document has no forms, or ``formxpath`` was
        given but no form could be reached from it.
    :raises IndexError: if ``formnumber`` is out of range.
    """
    from scrapy.selector.lxmldocument import LxmlDocument
    root = LxmlDocument(response, lxml.html.HTMLParser)
    # Look the forms up once; they are needed again for the index lookup.
    forms = root.forms
    if not forms:
        raise ValueError("No <form> element found in %s" % response)

    if formname is not None:
        # Pass the name as an XPath variable instead of interpolating it
        # into the expression: a name containing a double quote would
        # otherwise break (or change the meaning of) the query.
        f = root.xpath('//form[@name=$name]', name="%s" % formname)
        if f:
            return f[0]

    # Get form element from xpath, if not found, go up
    if formxpath is not None:
        nodes = root.xpath(formxpath)
        if nodes:
            el = nodes[0]
            # Walk from the selected node towards the root until a form
            # element is found or the ancestors run out.
            while el is not None:
                if el.tag == 'form':
                    return el
                el = el.getparent()
        raise ValueError('No <form> element found with %s' % formxpath)

    # If we get here, it means that either formname was None
    # or invalid
    if formnumber is not None:
        try:
            form = forms[formnumber]
        except IndexError:
            raise IndexError("Form number %d not found in %s" %
                             (formnumber, response))
        else:
            return form

    # No selector produced a form.
    return None
    def test_caching(self):
        # Building a document twice from the same response must give back
        # the very same object, while a copy of the response must get a
        # document of its own.
        original = HtmlResponse(
            'http://www.example.com',
            body='<html><head></head><body></body></html>')
        duplicate = original.copy()

        first = LxmlDocument(original)
        second = LxmlDocument(original)
        from_duplicate = LxmlDocument(duplicate)

        # same response -> cached document is reused
        assert first is second
        # copied response -> separate document
        assert first is not from_duplicate
Beispiel #3
0
    def test_caching(self):
        # Building a document twice from the same response must give back
        # the very same object, while a copy of the response must get a
        # document of its own.
        original = HtmlResponse(
            'http://www.example.com',
            body='<html><head></head><body></body></html>')
        duplicate = original.copy()

        first = LxmlDocument(original)
        second = LxmlDocument(original)
        from_duplicate = LxmlDocument(duplicate)

        # same response -> cached document is reused
        assert first is second
        # copied response -> separate document
        assert first is not from_duplicate

        # release the documents explicitly so they do not linger in memory
        # and show up as false libxml2 leak reports
        del first, second, from_duplicate
 def test_null_char(self):
     # A body containing a null character ('\x00') must not make
     # LxmlDocument raise a TypeError; the test passes if construction
     # completes without an exception.
     null_body_response = TextResponse(
         'http://example.com/catalog/product/blabla-123',
         headers={'Content-Type': 'text/plain; charset=utf-8'},
         body='test problematic \x00 body',
     )
     LxmlDocument(null_body_response)
    def generate_form_requests_from_response(self, response, **kwargs):
        """Yield a form request for every (form, search term) combination.

        Each form found in ``response`` is filled once per entry of
        ``self.search_terms`` via ``self._fill_form``, and the resulting
        method and form data are submitted with
        ``FormRequest.from_response``.

        :param response: response whose forms are parsed with
            ``LxmlDocument`` and ``HTMLParser``.
        :param kwargs: extra keyword arguments forwarded unchanged to
            ``FormRequest.from_response``. A caller-supplied
            ``formnumber`` takes precedence over the per-form default
            set here.
        :returns: generator of the requests built by
            ``FormRequest.from_response``.
        """
        doc = LxmlDocument(response, HTMLParser)

        for form_index, form in enumerate(doc.forms):
            # Without a form selector, from_response targets its default
            # form (number 0), so data filled from the second or later
            # form would be submitted against the first one. Default the
            # selector to the form being filled.
            request_kwargs = dict(kwargs)
            request_kwargs.setdefault('formnumber', form_index)

            for search_term in self.search_terms:
                # find input boxes and fill them out
                form_method, formdata = self._fill_form(form, search_term)

                yield FormRequest.from_response(response,
                                                formdata=formdata,
                                                method=form_method,
                                                **request_kwargs)
Beispiel #6
0
def _get_form(response, formname, formnumber):
    """Find the form element of ``response`` selected by name or position.

    ``formname`` is tried first; if it is ``None`` or matches nothing,
    ``formnumber`` is used.

    :param response: response handed to ``LxmlDocument`` together with
        ``lxml.html.HTMLParser`` to obtain the document root.
    :param formname: value of the form's ``name`` attribute, or ``None``
        to skip the lookup by name.
    :param formnumber: index into the document's forms, or ``None`` to
        skip the lookup by position.
    :returns: the selected form element, or ``None`` when the name did
        not match and ``formnumber`` is ``None``.
    :raises ValueError: if the document has no forms.
    :raises IndexError: if ``formnumber`` is out of range.
    """
    from scrapy.selector.lxmldocument import LxmlDocument
    root = LxmlDocument(response, lxml.html.HTMLParser)
    # Look the forms up once; they are needed again for the index lookup.
    forms = root.forms
    if not forms:
        raise ValueError("No <form> element found in %s" % response)

    if formname is not None:
        # Pass the name as an XPath variable instead of interpolating it
        # into the expression: a name containing a double quote would
        # otherwise break (or change the meaning of) the query.
        f = root.xpath('//form[@name=$name]', name="%s" % formname)
        if f:
            return f[0]

    # If we get here, it means that either formname was None
    # or invalid
    if formnumber is not None:
        try:
            form = forms[formnumber]
        except IndexError:
            raise IndexError("Form number %d not found in %s" %
                             (formnumber, response))
        else:
            return form

    # Neither selector produced a form.
    return None
Beispiel #7
0
def _get_form(response, formname, formnumber):
    """Find the form element of ``response`` selected by name or position.

    ``formname`` is tried first; if it is ``None`` or matches nothing,
    ``formnumber`` is used.

    :param response: response handed to ``LxmlDocument`` together with
        ``lxml.html.HTMLParser`` to obtain the document root.
    :param formname: value of the form's ``name`` attribute, or ``None``
        to skip the lookup by name.
    :param formnumber: index into the document's forms, or ``None`` to
        skip the lookup by position.
    :returns: the selected form element, or ``None`` when the name did
        not match and ``formnumber`` is ``None``.
    :raises ValueError: if the document has no forms.
    :raises IndexError: if ``formnumber`` is out of range.
    """
    from scrapy.selector.lxmldocument import LxmlDocument
    root = LxmlDocument(response, lxml.html.HTMLParser)
    # Look the forms up once; they are needed again for the index lookup.
    forms = root.forms
    if not forms:
        raise ValueError("No <form> element found in %s" % response)

    if formname is not None:
        # Pass the name as an XPath variable instead of interpolating it
        # into the expression: a name containing a double quote would
        # otherwise break (or change the meaning of) the query.
        f = root.xpath('//form[@name=$name]', name="%s" % formname)
        if f:
            return f[0]

    # If we get here, it means that either formname was None
    # or invalid
    if formnumber is not None:
        try:
            form = forms[formnumber]
        except IndexError:
            raise IndexError("Form number %d not found in %s" %
                             (formnumber, response))
        else:
            return form

    # Neither selector produced a form.
    return None
Beispiel #8
0
def _get_form(response, formname, formnumber, formxpath):
    """Find the form element of ``response`` selected by the arguments.

    Selectors are tried in this order: ``formname``, ``formxpath``,
    ``formnumber``. A ``formname`` that matches nothing is not an error;
    the lookup falls through to the next selector.

    :param response: response handed to ``LxmlDocument`` together with
        ``lxml.html.HTMLParser`` to obtain the document root.
    :param formname: value of the form's ``name`` attribute, or ``None``
        to skip the lookup by name.
    :param formnumber: index into the list of ``//form`` elements, or
        ``None`` to skip the lookup by position.
    :param formxpath: XPath expression; its first result, or the nearest
        ``form`` ancestor of that result, is returned. ``None`` to skip.
    :returns: the selected form element, or ``None`` when no selector
        matched and ``formnumber`` is ``None``.
    :raises ValueError: if the document has no ``form`` elements, or
        ``formxpath`` was given but no form could be reached from it.
    :raises IndexError: if ``formnumber`` is out of range.
    """
    from scrapy.selector.lxmldocument import LxmlDocument
    root = LxmlDocument(response, lxml.html.HTMLParser)
    forms = root.xpath('//form')
    if not forms:
        raise ValueError("No <form> element found in %s" % response)

    if formname is not None:
        # Pass the name as an XPath variable instead of interpolating it
        # into the expression: a name containing a double quote would
        # otherwise break (or change the meaning of) the query.
        f = root.xpath('//form[@name=$name]', name="%s" % formname)
        if f:
            return f[0]

    # Get form element from xpath, if not found, go up
    if formxpath is not None:
        nodes = root.xpath(formxpath)
        if nodes:
            el = nodes[0]
            # Walk from the selected node towards the root until a form
            # element is found or the ancestors run out.
            while el is not None:
                if el.tag == 'form':
                    return el
                el = el.getparent()
        raise ValueError('No <form> element found with %s' % formxpath)

    # If we get here, it means that either formname was None
    # or invalid
    if formnumber is not None:
        try:
            form = forms[formnumber]
        except IndexError:
            raise IndexError("Form number %d not found in %s" %
                             (formnumber, response))
        else:
            return form

    # No selector produced a form.
    return None