Esempio n. 1
0
    def test_xmliter_unicode(self):
        """Iterate nodes with non-ASCII tag names from a bytes and a text body."""
        # example taken from https://github.com/scrapy/scrapy/issues/1665
        body = u"""<?xml version="1.0" encoding="UTF-8"?>
            <þingflokkar>
               <þingflokkur id="26">
                  <heiti />
                  <skammstafanir>
                     <stuttskammstöfun>-</stuttskammstöfun>
                     <löngskammstöfun />
                  </skammstafanir>
                  <tímabil>
                     <fyrstaþing>80</fyrstaþing>
                  </tímabil>
               </þingflokkur>
               <þingflokkur id="21">
                  <heiti>Alþýðubandalag</heiti>
                  <skammstafanir>
                     <stuttskammstöfun>Ab</stuttskammstöfun>
                     <löngskammstöfun>Alþb.</löngskammstöfun>
                  </skammstafanir>
                  <tímabil>
                     <fyrstaþing>76</fyrstaþing>
                     <síðastaþing>123</síðastaþing>
                  </tímabil>
               </þingflokkur>
               <þingflokkur id="27">
                  <heiti>Alþýðuflokkur</heiti>
                  <skammstafanir>
                     <stuttskammstöfun>A</stuttskammstöfun>
                     <löngskammstöfun>Alþfl.</löngskammstöfun>
                  </skammstafanir>
                  <tímabil>
                     <fyrstaþing>27</fyrstaþing>
                     <síðastaþing>120</síðastaþing>
                  </tímabil>
               </þingflokkur>
            </þingflokkar>"""

        # The document is supplied once as UTF-8 bytes and once as text;
        # the text variant carries an explicit encoding.
        bytes_response = XmlResponse(url="http://example.com",
                                     body=body.encode('utf-8'))
        text_response = XmlResponse(url="http://example.com",
                                    body=body,
                                    encoding='utf-8')

        for response in (bytes_response, text_response):
            collected = [
                (node.attrib['id'],
                 node.xpath(
                     u'./skammstafanir/stuttskammstöfun/text()').getall(),
                 node.xpath(u'./tímabil/fyrstaþing/text()').getall())
                for node in self.xmliter(response, u'þingflokkur')
            ]

            self.assertEqual(collected, [(u'26', [u'-'], [u'80']),
                                         (u'21', [u'Ab'], [u'76']),
                                         (u'27', [u'A'], [u'27'])])
 def parse_field(self, html, fn):
     """Load a declaration item from *html* and return it as a dict.

     *html* is wrapped in a ``<book><row>`` document; *fn* is applied to
     the first ``entry`` node and its result is added to a loader that is
     scoped to the first ``row`` node.
     """
     document = '<book><row>%s</row></book>' % html
     response = XmlResponse('http://localhost/test.html', body=document)
     first_row = response.css('row')[0]
     entry = response.css('entry')[0]
     loader = Loader(self.spider, response, LobbyistDeclaration(), first_row)
     loader.add_value(None, fn(entry))
     return dict(loader.load_item())
Esempio n. 3
0
 def parse_field(self, html, fn):
     """Return the item loaded from *html* as a plain dict.

     The markup is embedded in ``<book><row>...</row></book>``; the value
     produced by ``fn`` for the first ``entry`` node is fed to a loader
     whose selector is the first ``row`` node.
     """
     wrapped = '<book><row>%s</row></book>' % html
     response = XmlResponse('http://localhost/test.html', body=wrapped)
     row_node = response.css('row')[0]
     entry_node = response.css('entry')[0]
     loader = Loader(self.spider, response, LobbyistDeclaration(), row_node)
     loader.add_value(None, fn(entry_node))
     loaded = loader.load_item()
     return dict(loaded)
    def test_parse_declaration_xml_4_columns(self):
        """Parse the 2012 fixture and check the item count and first item."""
        # this format was used for 2012 and 2013 declarations
        url = 'http://old.vtek.lt/vtek/.../deklaracija2012.doc'
        payload = fixture('lobist_veiklos_atatskaita_2012.doc.xml')
        response = XmlResponse(url, body=payload)
        request = scrapy.Request(response.url)
        response.request = request
        request.meta['year'] = '2012'

        items = list(self.spider.parse_declaration_xml(response))
        self.assertEqual(len(items), 30)
        first = items[0]
        self.assertEqual(first['name'], 'ROMAS STUMBRYS')
        self.assertEqual(first['comments'], u'Lobistinės veiklos nevykdė')
        self.assertEqual(first['year'], '2012')
        self.assertEqual(first['source_url'], response.url)
Esempio n. 5
0
    def test_parse_declaration_xml_4_columns(self):
        """The 2012 declaration fixture yields 30 items with the expected first item."""
        # this format was used for 2012 and 2013 declarations
        fixture_body = fixture('lobist_veiklos_atatskaita_2012.doc.xml')
        response = XmlResponse(
            'http://old.vtek.lt/vtek/.../deklaracija2012.doc',
            body=fixture_body)
        response.request = scrapy.Request(response.url)
        response.request.meta['year'] = '2012'

        parsed = list(self.spider.parse_declaration_xml(response))
        self.assertEqual(len(parsed), 30)

        head = parsed[0]
        self.assertEqual(head['name'], 'ROMAS STUMBRYS')
        self.assertEqual(head['comments'], u'Lobistinės veiklos nevykdė')
        self.assertEqual(head['year'], '2012')
        self.assertEqual(head['source_url'], response.url)
    def test_parse_declaration_xml_5_columns(self):
        """Parse the 2014 fixture and check the count plus first and last items."""
        # this format was used for 2014 declarations
        url = 'http://old.vtek.lt/vtek/.../deklaracija2014.doc'
        payload = fixture('Info_apie_lobistu_ataskaitas_2014_2015_04_08.doc.xml')
        response = XmlResponse(url, body=payload)
        request = scrapy.Request(response.url)
        response.request = request
        request.meta['year'] = '2014'

        items = list(self.spider.parse_declaration_xml(response))
        self.assertEqual(len(items), 34)

        first = items[0]
        self.assertEqual(first['name'], 'ROMAS STUMBRYS')
        self.assertEqual(first['year'], '2014')
        self.assertEqual(first['source_url'], response.url)

        last = items[-1]
        self.assertEqual(last['name'], u'UAB INLINEN')
        self.assertEqual(last['comments'], u'Lobistinės veiklos nevykdė')
    def test_xmliter_namespaces(self):
        """Prefixed children are reachable only through the registered ``g`` prefix."""
        feed = b"""\
            <?xml version="1.0" encoding="UTF-8"?>
            <rss version="2.0" xmlns:g="http://base.google.com/ns/1.0">
                <channel>
                <title>My Dummy Company</title>
                <link>http://www.mydummycompany.com</link>
                <description>This is a dummy company. We do nothing.</description>
                <item>
                    <title>Item 1</title>
                    <description>This is item 1</description>
                    <link>http://www.mydummycompany.com/items/1</link>
                    <g:image_link>http://www.mydummycompany.com/images/item1.jpg</g:image_link>
                    <g:id>ITEM_1</g:id>
                    <g:price>400</g:price>
                </item>
                </channel>
            </rss>
        """
        response = XmlResponse(url='http://mydummycompany.com', body=feed)
        item = next(self.xmliter(response, 'item'))
        item.register_namespace('g', 'http://base.google.com/ns/1.0')

        # (xpath, expected result) pairs, checked in order; the last three
        # omit the prefix and are expected to match nothing.
        expectations = (
            ('title/text()', ['Item 1']),
            ('description/text()', ['This is item 1']),
            ('link/text()', ['http://www.mydummycompany.com/items/1']),
            ('g:image_link/text()', ['http://www.mydummycompany.com/images/item1.jpg']),
            ('g:id/text()', ['ITEM_1']),
            ('g:price/text()', ['400']),
            ('image_link/text()', []),
            ('id/text()', []),
            ('price/text()', []),
        )
        for query, expected in expectations:
            self.assertEqual(item.xpath(query).getall(), expected)
    def test_xmliter_namespaces_prefix(self):
        """Iterate ``table`` nodes per namespace, querying with the given prefix."""
        body = b"""\
        <?xml version="1.0" encoding="UTF-8"?>
        <root>
            <h:table xmlns:h="http://www.w3.org/TR/html4/">
              <h:tr>
                <h:td>Apples</h:td>
                <h:td>Bananas</h:td>
              </h:tr>
            </h:table>

            <f:table xmlns:f="http://www.w3schools.com/furniture">
              <f:name>African Coffee Table</f:name>
              <f:width>80</f:width>
              <f:length>120</f:length>
            </f:table>

        </root>
        """
        response = XmlResponse(url='http://mydummycompany.com', body=body)

        # First pass: the table in the HTML4 namespace, addressed as "h".
        html_tables = self.xmliter(response, 'table', 'http://www.w3.org/TR/html4/', 'h')
        html_table = next(html_tables)
        cells = html_table.xpath('h:tr/h:td').getall()
        self.assertEqual(len(cells), 2)
        self.assertEqual(html_table.xpath('h:tr/h:td[1]/text()').getall(), ['Apples'])
        self.assertEqual(html_table.xpath('h:tr/h:td[2]/text()').getall(), ['Bananas'])

        # Second pass: the table in the furniture namespace, addressed as "f".
        furniture_tables = self.xmliter(response, 'table', 'http://www.w3schools.com/furniture', 'f')
        furniture_table = next(furniture_tables)
        self.assertEqual(furniture_table.xpath('f:name/text()').getall(), ['African Coffee Table'])
Esempio n. 9
0
    def test_selector_namespaces_multiple(self):
        body = """<?xml version="1.0" encoding="UTF-8"?>
<BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05"
            xmlns:b="http://somens.com"
            xmlns:p="http://www.scrapy.org/product" >
    <b:Operation>hello</b:Operation>
    <TestTag b:att="value"><Other>value</Other></TestTag>
    <p:SecondTestTag><material>iron</material><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag>
</BrowseNode>
        """
        response = XmlResponse(url="http://example.com", body=body)
        x = self.xxs_cls(response)

        x.register_namespace(
            "xmlns",
            "http://webservices.amazon.com/AWSECommerceService/2005-10-05")
        x.register_namespace("p", "http://www.scrapy.org/product")
        x.register_namespace("b", "http://somens.com")
        self.assertEqual(len(x.select("//xmlns:TestTag")), 1)
        self.assertEqual(
            x.select("//b:Operation/text()").extract()[0], 'hello')
        self.assertEqual(
            x.select("//xmlns:TestTag/@b:att").extract()[0], 'value')
        self.assertEqual(
            x.select("//p:SecondTestTag/xmlns:price/text()").extract()[0],
            '90')
        self.assertEqual(
            x.select("//p:SecondTestTag").select("./xmlns:price/text()")
            [0].extract(), '90')
        self.assertEqual(
            x.select("//p:SecondTestTag/xmlns:material/text()").extract()[0],
            'iron')
Esempio n. 10
0
 def test_invalid_xpath_unicode(self):
     "Test *Unicode* invalid xpath raises ValueError with the invalid xpath"
     # Bytes body: a text body without an explicit encoding is not accepted
     # by the response class on Python 3, and b"" is still a str on Python 2.
     response = XmlResponse(url="http://example.com", body=b"<html></html>")
     x = self.sscls(response)
     xpath = u"//test[@foo='\u0431ar]"
     # On Python 2 the message carries the escaped form of the XPath.
     encoded = xpath if six.PY3 else xpath.encode('unicode_escape')
     # six.assertRaisesRegex maps to the right method name on each Python;
     # the assertRaisesRegexp alias was removed in Python 3.12.
     six.assertRaisesRegex(self, ValueError, re.escape(encoded), x.xpath, xpath)
Esempio n. 11
0
 def test_xmliter_encoding(self):
     """A body declared as ISO-8859-9 is returned as the matching Unicode text."""
     raw = b'<?xml version="1.0" encoding="ISO-8859-9"?>\n<xml>\n    <item>Some Turkish Characters \xd6\xc7\xde\xdd\xd0\xdc \xfc\xf0\xfd\xfe\xe7\xf6</item>\n</xml>\n\n'
     expected = u'<item>Some Turkish Characters \xd6\xc7\u015e\u0130\u011e\xdc \xfc\u011f\u0131\u015f\xe7\xf6</item>'
     response = XmlResponse('http://www.example.com', body=raw)
     first_item = next(self.xmliter(response, 'item'))
     self.assertEqual(first_item.get(), expected)
Esempio n. 12
0
    def test_parse_declaration_xml_5_columns(self):
        """The 2014 declaration fixture yields 34 items; check the first and last."""
        # this format was used for 2014 declarations
        fixture_body = fixture(
            'Info_apie_lobistu_ataskaitas_2014_2015_04_08.doc.xml')
        response = XmlResponse(
            'http://old.vtek.lt/vtek/.../deklaracija2014.doc',
            body=fixture_body)
        response.request = scrapy.Request(response.url)
        response.request.meta['year'] = '2014'

        parsed = list(self.spider.parse_declaration_xml(response))
        self.assertEqual(len(parsed), 34)

        head = parsed[0]
        self.assertEqual(head['name'], 'ROMAS STUMBRYS')
        self.assertEqual(head['year'], '2014')
        self.assertEqual(head['source_url'], response.url)

        tail = parsed[-1]
        self.assertEqual(tail['name'], u'UAB INLINEN')
        self.assertEqual(tail['comments'], u'Lobistinės veiklos nevykdė')
Esempio n. 13
0
    def test_xmliter(self):
        """Each ``product`` node exposes its id attribute and child text."""
        body = b"""
            <?xml version="1.0" encoding="UTF-8"?>
            <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                      xsi:noNamespaceSchemaLocation="someschmea.xsd">
              <product id="001">
                <type>Type 1</type>
                <name>Name 1</name>
              </product>
              <product id="002">
                <type>Type 2</type>
                <name>Name 2</name>
              </product>
            </products>
        """

        response = XmlResponse(url="http://example.com", body=body)
        # One (id, names, types) tuple per product, in document order.
        collected = [
            (
                product.attrib["id"],
                product.xpath("name/text()").getall(),
                product.xpath("./type/text()").getall(),
            )
            for product in self.xmliter(response, "product")
        ]

        expected = [
            ("001", ["Name 1"], ["Type 1"]),
            ("002", ["Name 2"], ["Type 2"]),
        ]
        self.assertEqual(collected, expected)
Esempio n. 14
0
        def test_xhtml(self):
            xhtml = b"""
    <?xml version="1.0"?>
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
    <head>
        <title>XHTML document title</title>
    </head>
    <body>
        <div class='links'>
        <p><a href="/about.html">About us</a></p>
        </div>
        <div>
        <p><a href="/follow.html">Follow this link</a></p>
        </div>
        <div>
        <p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
        </div>
        <div>
        <p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
        </div>
        <div>
        <p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
        </div>
    </body>
    </html>
            """

            response = HtmlResponse("http://example.com/index.xhtml", body=xhtml)

            lx = self.extractor_cls()
            self.assertEqual(
                lx.extract_links(response),
                [
                    Link(url='http://example.com/about.html', text='About us', fragment='', nofollow=False),
                    Link(url='http://example.com/follow.html', text='Follow this link', fragment='', nofollow=False),
                    Link(url='http://example.com/nofollow.html', text='Dont follow this one',
                         fragment='', nofollow=True),
                    Link(url='http://example.com/nofollow2.html', text='Choose to follow or not',
                         fragment='', nofollow=False),
                    Link(url='http://google.com/something', text='External link not to follow', nofollow=True),
                ]
            )

            response = XmlResponse("http://example.com/index.xhtml", body=xhtml)

            lx = self.extractor_cls()
            self.assertEqual(
                lx.extract_links(response),
                [
                    Link(url='http://example.com/about.html', text='About us', fragment='', nofollow=False),
                    Link(url='http://example.com/follow.html', text='Follow this link', fragment='', nofollow=False),
                    Link(url='http://example.com/nofollow.html', text='Dont follow this one',
                         fragment='', nofollow=True),
                    Link(url='http://example.com/nofollow2.html', text='Choose to follow or not',
                         fragment='', nofollow=False),
                    Link(url='http://google.com/something', text='External link not to follow', nofollow=True),
                ]
            )
Esempio n. 15
0
def test_spider_parse_cities():
    """The first job parsed from the cities feed lists both raw locations."""
    feed_path = Path(FIXTURES_DIR / 'feed_cities.xml')
    response = XmlResponse('https://example.com/example/',
                           body=feed_path.read_bytes())
    parsed_jobs = startupjobs.Spider().parse(response)
    first_job = next(parsed_jobs)

    assert first_job['locations_raw'] == ['Praha, Česko', 'Olomouc, Česko']
Esempio n. 16
0
def test_spider_parse_job_types():
    """The first job parsed from the job-types feed carries both employment types."""
    feed_path = Path(FIXTURES_DIR / 'feed_job_types.xml')
    response = XmlResponse('https://example.com/example/',
                           body=feed_path.read_bytes())
    parsed_jobs = startupjobs.Spider().parse(response)
    first_job = next(parsed_jobs)

    assert first_job['employment_types'] == ['Full-time', 'External collaboration']
Esempio n. 17
0
    def test_xmliter_iterate_namespace(self):
        """``image_link`` nodes are found only when their namespace is supplied."""
        body = """\
            <?xml version="1.0" encoding="UTF-8"?>
            <rss version="2.0" xmlns="http://base.google.com/ns/1.0">
                <channel>
                <title>My Dummy Company</title>
                <link>http://www.mydummycompany.com</link>
                <description>This is a dummy company. We do nothing.</description>
                <item>
                    <title>Item 1</title>
                    <description>This is item 1</description>
                    <link>http://www.mydummycompany.com/items/1</link>
                    <image_link>http://www.mydummycompany.com/images/item1.jpg</image_link>
                    <image_link>http://www.mydummycompany.com/images/item2.jpg</image_link>
                </item>
                </channel>
            </rss>
        """
        response = XmlResponse(url='http://mydummycompany.com', body=body)

        # Without the namespace, nothing is expected to match.
        no_namespace_iter = self.xmliter(response, 'image_link')
        self.assertEqual(len(list(no_namespace_iter)), 0)

        namespace_iter = self.xmliter(response, 'image_link',
                                      'http://base.google.com/ns/1.0')
        # Use the next() builtin rather than the Python-2-only .next() method
        # so the iterator is advanced the same way on Python 2.6+ and 3.
        node = next(namespace_iter)
        self.assertEqual(
            node.select('text()').extract(),
            ['http://www.mydummycompany.com/images/item1.jpg'])
        node = next(namespace_iter)
        self.assertEqual(
            node.select('text()').extract(),
            ['http://www.mydummycompany.com/images/item2.jpg'])
Esempio n. 18
0
    def test_xmliter_iterate_namespace(self):
        """``image_link`` nodes are yielded only when the default namespace is passed."""
        body = b"""
            <?xml version="1.0" encoding="UTF-8"?>
            <rss version="2.0" xmlns="http://base.google.com/ns/1.0">
                <channel>
                <title>My Dummy Company</title>
                <link>http://www.mydummycompany.com</link>
                <description>This is a dummy company. We do nothing.</description>
                <item>
                    <title>Item 1</title>
                    <description>This is item 1</description>
                    <link>http://www.mydummycompany.com/items/1</link>
                    <image_link>http://www.mydummycompany.com/images/item1.jpg</image_link>
                    <image_link>http://www.mydummycompany.com/images/item2.jpg</image_link>
                </item>
                </channel>
            </rss>
        """
        response = XmlResponse(url="http://mydummycompany.com", body=body)

        # Iterating without a namespace is expected to yield nothing.
        unqualified = list(self.xmliter(response, "image_link"))
        self.assertEqual(len(unqualified), 0)

        qualified = self.xmliter(response, "image_link",
                                 "http://base.google.com/ns/1.0")
        # The two image links are consumed one at a time, in document order.
        first_link = next(qualified)
        self.assertEqual(
            first_link.xpath("text()").getall(),
            ["http://www.mydummycompany.com/images/item1.jpg"],
        )
        second_link = next(qualified)
        self.assertEqual(
            second_link.xpath("text()").getall(),
            ["http://www.mydummycompany.com/images/item2.jpg"],
        )
Esempio n. 19
0
def test_spider_parse_html_entities():
    """Title and company name of the first job contain a literal ampersand."""
    feed_path = Path(FIXTURES_DIR / 'feed_html_entities.xml')
    response = XmlResponse('https://example.com/example/',
                           body=feed_path.read_bytes())
    parsed_jobs = startupjobs.Spider().parse(response)
    first_job = next(parsed_jobs)

    assert first_job['title'] == 'Analytik&programátor Junior'
    assert first_job['company_name'] == 'P&J Capital'
Esempio n. 20
0
 def test_selector_invalid_xpath(self):
     """Selecting with a malformed XPath must raise ValueError naming that XPath."""
     response = XmlResponse(url="http://example.com", body="<html></html>")
     x = self.hxs_cls(response)
     xpath = "//test[@foo='bar]"
     try:
         x.select(xpath)
     except ValueError as e:  # "as" form parses on Python 2.6+ and Python 3
         assert xpath in str(e), "Exception message does not contain invalid xpath"
     else:
         # Without this branch the test passed vacuously when nothing was raised.
         raise AssertionError("A invalid XPath does not raise an exception")
Esempio n. 21
0
    def test_get_sitemap_body(self):
        """Check the sitemap body for XML, HTML and plain responses.

        Via ``assertSitemapBody`` the expected value is ``self.BODY`` for the
        XmlResponse and ``None`` for the HtmlResponse and the plain Response.
        """
        cases = (
            (XmlResponse, "http://www.example.com/", self.BODY),
            (HtmlResponse, "http://www.example.com/", None),
            (Response, "http://www.example.com/favicon.ico", None),
        )
        for response_cls, url, expected_body in cases:
            response = response_cls(url=url, body=self.BODY)
            self.assertSitemapBody(response, expected_body)
Esempio n. 22
0
def test_spider_parse_cities_job_objects_are_copies():
    """Changing one parsed job's title must leave the other job's title intact."""
    feed_path = Path(FIXTURES_DIR / 'feed_cities.xml')
    response = XmlResponse('https://example.com/example/',
                           body=feed_path.read_bytes())
    jobs = list(startupjobs.Spider().parse(response))
    modified, untouched = jobs[0], jobs[1]
    modified['title'] = 'Modified'

    assert modified['link'] == untouched['link']
    assert modified['title'] == 'Modified'
    assert untouched['title'] == 'Server / Cloud / DevOps Admin'
Esempio n. 23
0
    def test_parse_xml_report(self):
        '''Parse XML 10-Q or 10-K report.'''
        spider = EdgarSpider()
        spider._follow_links = True  # HACK

        # NOTE(review): the declaration below lacks its closing "?" and
        # DocumentFiscalYearFocus is closed with "</dei>"; presumably the
        # parser is expected to recover -- confirm this is intentional.
        body = '''
            <?xml version="1.0">
            <xbrl xmlns="http://www.xbrl.org/2003/instance"
                  xmlns:xbrli="http://www.xbrl.org/2003/instance"
                  xmlns:dei="http://xbrl.sec.gov/dei/2011-01-31"
                  xmlns:us-gaap="http://fasb.org/us-gaap/2011-01-31">

              <context id="c1">
                <startDate>2013-03-31</startDate>
                <endDate>2013-06-28</endDate>
              </context>

              <dei:AmendmentFlag contextRef="c1">false</dei:AmendmentFlag>
              <dei:DocumentType contextRef="c1">10-Q</dei:DocumentType>
              <dei:DocumentFiscalPeriodFocus contextRef="c1">Q2</dei:DocumentFiscalPeriodFocus>
              <dei:DocumentPeriodEndDate contextRef="c1">2013-06-28</dei:DocumentPeriodEndDate>
              <dei:DocumentFiscalYearFocus>2013</dei>

              <us-gaap:Revenues contextRef="c1">100</us-gaap:Revenues>
              <us-gaap:NetIncomeLoss contextRef="c1">200</us-gaap:NetIncomeLoss>
              <us-gaap:EarningsPerShareBasic contextRef="c1">0.2</us-gaap:EarningsPerShareBasic>
              <us-gaap:EarningsPerShareDiluted contextRef="c1">0.19</us-gaap:EarningsPerShareDiluted>
              <us-gaap:CommonStockDividendsPerShareDeclared contextRef="c1">0.07</us-gaap:CommonStockDividendsPerShareDeclared>

              <us-gaap:Assets contextRef="c1">1600</us-gaap:Assets>
              <us-gaap:StockholdersEquity contextRef="c1">300</us-gaap:StockholdersEquity>
              <us-gaap:CashAndCashEquivalentsAtCarryingValue contextRef="c1">150</us-gaap:CashAndCashEquivalentsAtCarryingValue>
            </xbrl>
        '''

        report_url = 'http://sec.gov/Archives/edgar/data/123/abc-20130720.xml'
        item = spider.parse_10qk(XmlResponse(report_url, body=body))

        expected = {
            'symbol': 'ABC',
            'amend': False,
            'doc_type': '10-Q',
            'period_focus': 'Q2',
            'fiscal_year': 2013,
            'end_date': '2013-06-28',
            'revenues': 100.0,
            'net_income': 200.0,
            'eps_basic': 0.2,
            'eps_diluted': 0.19,
            'dividend': 0.07,
            'assets': 1600.0,
            'equity': 300.0,
            'cash': 150.0
        }
        self.assert_item(item, expected)
Esempio n. 24
0
    def test_xml_entity_expansion(self):
        """An external entity reference must come back unexpanded."""
        malicious_xml = (
            '<?xml version="1.0" encoding="ISO-8859-1"?>'
            '<!DOCTYPE foo [ <!ELEMENT foo ANY > <!ENTITY xxe SYSTEM '
            '"file:///etc/passwd" >]><foo>&xxe;</foo>'
        )

        xml_response = XmlResponse('http://example.com', body=malicious_xml)
        selector = self.sscls(response=xml_response)

        # The entity reference is expected verbatim, not the file contents.
        self.assertEqual(selector.extract(), '<foo>&xxe;</foo>')
 def test_xmliter_unusual_node(self):
     """A node name containing dots matches only the node with that exact name."""
     body = b"""<?xml version="1.0" encoding="UTF-8"?>
         <root>
             <matchme...></matchme...>
             <matchmenot></matchmenot>
         </root>
     """
     response = XmlResponse(url="http://example.com", body=body)
     matched_names = []
     for element in self.xmliter(response, 'matchme...'):
         matched_names.append(element.xpath('name()').getall())
     self.assertEqual(matched_names, [['matchme...']])
Esempio n. 26
0
    def test_flavor_detection(self):
        """The selector type follows the response class and changes how markup is serialized."""
        markup = b'<div><img src="a.jpg"><p>Hello</div>'

        xml_selector = Selector(XmlResponse('http://example.com', body=markup, encoding='utf-8'))
        self.assertEqual(xml_selector.type, 'xml')
        xml_divs = xml_selector.xpath("//div").getall()
        self.assertEqual(xml_divs,
                         [u'<div><img src="a.jpg"><p>Hello</p></img></div>'])

        html_selector = Selector(HtmlResponse('http://example.com', body=markup, encoding='utf-8'))
        self.assertEqual(html_selector.type, 'html')
        html_divs = html_selector.xpath("//div").getall()
        self.assertEqual(html_divs,
                         [u'<div><img src="a.jpg"><p>Hello</p></div>'])
Esempio n. 27
0
 def parse_single(self, response):
     """Fetch one record through Sickle and parse it.

     The record named by ``response.meta['identifier']`` is requested with
     ``GetRecord`` using ``self.format`` as the metadata prefix, stored in
     ``self._crawled_records`` under that identifier, and its raw XML is
     wrapped in a UTF-8 ``XmlResponse`` whose selector is handed to
     ``self.parse_record``.
     """
     client = Sickle(self.url)
     metadata_prefix = self.format
     identifier = response.meta['identifier']
     record = client.GetRecord(metadataPrefix=metadata_prefix,
                               identifier=identifier)
     self._crawled_records[identifier] = record
     xml_response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
     return self.parse_record(Selector(xml_response, type='xml'))
Esempio n. 28
0
    def test_remove_attributes_namespaces(self):
        xml = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns:atom="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/">
  <link atom:type="text/html">
  <link atom:type="application/atom+xml">
</feed>
"""
        sel = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
        self.assertEqual(len(sel.xpath("//link/@type")), 0)
        sel.remove_namespaces()
        self.assertEqual(len(sel.xpath("//link/@type")), 2)
Esempio n. 29
0
    def test_flavor_detection(self):
        """The selector type is taken from the response class (xml vs html)."""
        markup = '<div><img src="a.jpg"><p>Hello</div>'

        xml_selector = self.sscls(XmlResponse('http://example.com', body=markup))
        self.assertEqual(xml_selector.type, 'xml')
        xml_divs = xml_selector.xpath("//div").extract()
        self.assertEqual(xml_divs,
                         [u'<div><img src="a.jpg"><p>Hello</p></img></div>'])

        html_selector = self.sscls(HtmlResponse('http://example.com', body=markup))
        self.assertEqual(html_selector.type, 'html')
        html_divs = html_selector.xpath("//div").extract()
        self.assertEqual(html_divs,
                         [u'<div><img src="a.jpg"><p>Hello</p></div>'])
Esempio n. 30
0
    def test_remove_namespaces(self):
        xml = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/">
  <link type="text/html">
  <link type="application/atom+xml">
</feed>
"""
        xxs = XmlXPathSelector(
            XmlResponse("http://example.com/feed.atom", body=xml))
        self.assertEqual(len(xxs.select("//link")), 0)
        xxs.remove_namespaces()
        self.assertEqual(len(xxs.select("//link")), 2)
Esempio n. 31
0
 def test_invalid_xpath(self):
     """A malformed XPath must raise ValueError whose message contains the XPath."""
     selector = self.sscls(
         XmlResponse(url="http://example.com", body="<html></html>"))
     bad_xpath = "//test[@foo='bar]"
     try:
         selector.xpath(bad_xpath)
     except ValueError as err:
         message = str(err)
         assert bad_xpath in message, "Exception message does not contain invalid xpath"
     except Exception:
         # Any other exception type counts as a failure.
         raise AssertionError("A invalid XPath does not raise ValueError")
     else:
         # No exception at all is a failure too.
         raise AssertionError("A invalid XPath does not raise an exception")
Esempio n. 32
0
    def test_selector_namespaces_simple(self):
        """A registered prefix selects only the element in that namespace."""
        body = """
        <test xmlns:somens="http://scrapy.org">
           <somens:a id="foo">take this</a>
           <a id="bar">found</a>
        </test>
        """

        selector = self.xxs_cls(
            XmlResponse(url="http://example.com", body=body))
        selector.register_namespace("somens", "http://scrapy.org")

        texts = selector.select("//somens:a/text()").extract()
        self.assertEqual(texts, [u'take this'])
Esempio n. 33
0
    def test_links_from_sitemap(self):
        """The "sitemaps" spider yields the three sitemap URLs found in the sample file."""
        # Read the sample through a context manager so the file handle is
        # closed deterministically instead of being left to the garbage collector.
        with open(join(_PATH, "data", "sitemap_sample.xml")) as sitemap_file:
            body = sitemap_file.read()
        response = XmlResponse(url="http://example.com/sample.xml", body=body,
                headers={'Content-Type': "text/xml; charset=UTF-8"})

        name = "sitemaps"
        spider = self.smanager.create(name)

        urls = [r.url for r in spider.parse(response)]
        self.assertEqual(len(urls), 3)
        # Compared as sets: the order of the yielded requests is not checked.
        self.assertEqual(set(urls), set([
                "https://www.siliconrepublic.com/post-sitemap1.xml",
                "https://www.siliconrepublic.com/post-sitemap2.xml",
                "https://www.siliconrepublic.com/post-sitemap3.xml"]))
 def parse_law_projects(self, html, fn):
     """Wrap *html* in a ``<book>`` document and return ``fn`` applied to its first ``entry`` node."""
     document = '<book>%s</book>' % html
     response = XmlResponse('http://localhost/test.html', body=document)
     first_entry = response.css('entry')[0]
     return fn(first_entry)