def test_extraction_encoding(self):
        # The "noenc" fixture is wrapped twice: once with an explicit utf-8
        # Content-Type header and once with no headers at all.
        noenc_html = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(
            url='http://example.com/utf8',
            body=noenc_html,
            headers={'Content-Type': ['text/html; charset=utf-8']},
        )
        response_noenc = HtmlResponse(
            url='http://example.com/noenc', body=noenc_html)
        latin1_html = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(
            url='http://example.com/latin1', body=latin1_html)

        extractor = BaseSgmlLinkExtractor()

        # Both responses built from the first fixture must yield these links.
        expected_utf8 = [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html',
                 text='sample \xe2\x82\xac text'.decode('utf-8')),
        ]
        for response in (response_utf8, response_noenc):
            self.assertEqual(extractor.extract_links(response), expected_utf8)

        # The latin1 fixture is expected to produce single-byte escapes.
        expected_latin1 = [
            Link(url='http://example.com/sample_%F1.html', text=''),
            Link(url='http://example.com/sample_%E1.html',
                 text='sample \xe1 text'.decode('latin1')),
        ]
        self.assertEqual(
            extractor.extract_links(response_latin1), expected_latin1)
Example #2
0
    def test_csviter_encoding(self):
        # feed-sample4.csv is parsed as latin1, feed-sample5.csv as cp852.
        latin1_body = get_testdata('feeds', 'feed-sample4.csv')
        cp852_body = get_testdata('feeds', 'feed-sample5.csv')

        latin1_response = TextResponse(
            url="http://example.com/", body=latin1_body, encoding='latin1')
        latin1_rows = list(csviter(latin1_response))
        expected_latin1 = [
            {u'id': u'1', u'name': u'latin1', u'value': u'test'},
            {u'id': u'2', u'name': u'something',
             u'value': u'\xf1\xe1\xe9\xf3'},
        ]
        self.assertEqual(latin1_rows, expected_latin1)

        cp852_response = TextResponse(
            url="http://example.com/", body=cp852_body, encoding='cp852')
        cp852_rows = list(csviter(cp852_response))
        expected_cp852 = [
            {u'id': u'1', u'name': u'cp852', u'value': u'test'},
            {u'id': u'2', u'name': u'something',
             u'value': u'\u255a\u2569\u2569\u2569\u2550\u2550\u2557'},
        ]
        self.assertEqual(cp852_rows, expected_cp852)
def _test_data(formats):
    """Load the uncompressed sample feed plus one Response per format.

    Returns a ``(uncompressed_body, test_responses)`` tuple, where
    ``test_responses`` maps each entry of ``formats`` to a ``Response``
    whose body is the ``feed-sample1.<format>`` fixture.
    """
    reference_body = get_testdata('compressed', 'feed-sample1.xml')
    responses_by_format = dict(
        (fmt, Response('http://foo.com/bar',
                       body=get_testdata('compressed', 'feed-sample1.' + fmt)))
        for fmt in formats
    )
    return reference_body, responses_by_format
def _test_data(formats):
    """Return the plain XML fixture and a dict of per-format responses.

    The dict is keyed by the entries of ``formats``; each value is a
    ``Response`` carrying the matching ``feed-sample1.<format>`` fixture.
    """
    uncompressed = get_testdata('compressed', 'feed-sample1.xml')
    responses = {}
    for fmt in formats:
        fixture_name = 'feed-sample1.' + fmt
        fixture_body = get_testdata('compressed', fixture_name)
        responses[fmt] = Response('http://foo.com/bar', body=fixture_body)
    return uncompressed, responses
    def test_extraction_encoding(self):
        # One fixture is wrapped with and without a utf-8 Content-Type
        # header; a second, latin1 fixture is wrapped without headers.
        plain_html = get_testdata("link_extractor", "linkextractor_noenc.html")
        response_utf8 = HtmlResponse(
            url="http://example.com/utf8",
            body=plain_html,
            headers={"Content-Type": ["text/html; charset=utf-8"]},
        )
        response_noenc = HtmlResponse(url="http://example.com/noenc", body=plain_html)
        latin1_html = get_testdata("link_extractor", "linkextractor_latin1.html")
        response_latin1 = HtmlResponse(url="http://example.com/latin1", body=latin1_html)

        extractor = BaseSgmlLinkExtractor()

        links_utf8 = extractor.extract_links(response_utf8)
        expected_utf8 = [
            Link(url="http://example.com/sample_%C3%B1.html", text=""),
            Link(url="http://example.com/sample_%E2%82%AC.html", text="sample \xe2\x82\xac text".decode("utf-8")),
        ]
        self.assertEqual(links_utf8, expected_utf8)

        links_noenc = extractor.extract_links(response_noenc)
        expected_noenc = [
            Link(url="http://example.com/sample_%C3%B1.html", text=""),
            Link(url="http://example.com/sample_%E2%82%AC.html", text="sample \xe2\x82\xac text".decode("utf-8")),
        ]
        self.assertEqual(links_noenc, expected_noenc)

        links_latin1 = extractor.extract_links(response_latin1)
        expected_latin1 = [
            Link(url="http://example.com/sample_%F1.html", text=""),
            Link(url="http://example.com/sample_%E1.html", text="sample \xe1 text".decode("latin1")),
        ]
        self.assertEqual(links_latin1, expected_latin1)
def _test_data(formats):
    """Build the fixtures shared by the compressed-feed tests.

    Returns the body of ``feed-sample1.xml`` together with a dict that maps
    every name in ``formats`` to a ``Response`` wrapping the fixture file
    ``feed-sample1.<name>``.
    """

    def _response_for(extension):
        # Wrap one fixture file in a Response with a fixed URL.
        payload = get_testdata("compressed", "feed-sample1." + extension)
        return Response("http://foo.com/bar", body=payload)

    xml_body = get_testdata("compressed", "feed-sample1.xml")
    by_format = {}
    for extension in formats:
        by_format[extension] = _response_for(extension)
    return xml_body, by_format
    def test_extraction_encoding(self):
        # Three responses are checked: utf-8 declared via header, no
        # declared charset (same fixture), and a separate latin1 fixture.
        html = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(
            url='http://example.com/utf8', body=html,
            headers={'Content-Type': ['text/html; charset=utf-8']})
        response_noenc = HtmlResponse(url='http://example.com/noenc', body=html)
        html = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1', body=html)

        utf8_links = [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html',
                 text='sample \xe2\x82\xac text'.decode('utf-8')),
        ]
        latin1_links = [
            Link(url='http://example.com/sample_%F1.html', text=''),
            Link(url='http://example.com/sample_%E1.html',
                 text='sample \xe1 text'.decode('latin1')),
        ]

        # Pair every response with the links it is expected to produce.
        cases = [
            (response_utf8, utf8_links),
            (response_noenc, utf8_links),
            (response_latin1, latin1_links),
        ]
        extractor = BaseSgmlLinkExtractor()
        for response, expected in cases:
            self.assertEqual(extractor.extract_links(response), expected)
    def test_csviter_encoding(self):
        # Each case is (raw body, encoding handed to TextResponse, rows
        # expected back from csviter).
        sample4 = get_testdata('feeds', 'feed-sample4.csv')
        sample5 = get_testdata('feeds', 'feed-sample5.csv')

        cases = (
            (sample4, 'latin1', [
                {u'id': u'1', u'name': u'latin1', u'value': u'test'},
                {u'id': u'2', u'name': u'something', u'value': u'\xf1\xe1\xe9\xf3'},
            ]),
            (sample5, 'cp852', [
                {u'id': u'1', u'name': u'cp852', u'value': u'test'},
                {u'id': u'2', u'name': u'something',
                 u'value': u'\u255a\u2569\u2569\u2569\u2550\u2550\u2557'},
            ]),
        )
        for raw_body, encoding, expected_rows in cases:
            response = TextResponse(url="http://example.com/", body=raw_body, encoding=encoding)
            self.assertEqual(list(csviter(response)), expected_rows)
Example #9
0
    def test_csviter_falserow(self):
        # Two extra rows ('a,b' and 'a,b,c,d') are appended to the fixture;
        # neither of them appears in the expected output below.
        sample = get_testdata('feeds', 'feed-sample3.csv')
        extra_rows = ('a,b', 'a,b,c,d')
        body = '\n'.join((sample,) + extra_rows)

        response = TextResponse(url="http://example.com/", body=body)
        parsed_rows = list(csviter(response))

        expected_rows = [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode',
             u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ]
        self.assertEqual(parsed_rows, expected_rows)
Example #10
0
    def test_csviter_headers(self):
        # Strip the first line off the fixture and hand it to csviter
        # through the ``headers`` argument instead.
        lines = get_testdata('feeds', 'feed-sample3.csv').splitlines()
        headers = lines[0].split(',')
        body = '\n'.join(lines[1:])

        response = TextResponse(url="http://example.com/", body=body)
        parsed_rows = list(csviter(response, headers=headers))

        expected_rows = [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode',
             u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ]
        self.assertEqual(parsed_rows, expected_rows)
Example #11
0
    def test_csviter_defaults(self):
        # Parse the fixture with csviter's default arguments and check both
        # the row contents and the types of every key and value.
        body = get_testdata('feeds', 'feed-sample3.csv')
        response = TextResponse(url="http://example.com/", body=body)

        result = list(csviter(response))
        self.assertEqual(result, [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode',
             u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ])

        # Equality alone would not catch byte strings that compare equal to
        # their unicode counterparts, so the types are checked explicitly.
        # assertTrue replaces the deprecated assert_ alias.
        for result_row in result:
            self.assertTrue(
                all(isinstance(key, unicode) for key in result_row))
            self.assertTrue(
                all(isinstance(value, unicode)
                    for value in result_row.values()))
    def test_extraction_encoding(self):
        # TODO: use own fixtures
        # The first fixture is wrapped with and without an explicit utf-8
        # Content-Type header; the latin1 fixture is wrapped without headers.
        body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(
            url='http://example.com/utf8', body=body,
            headers={'Content-Type': ['text/html; charset=utf-8']})
        response_noenc = HtmlResponse(url='http://example.com/noenc',
                                      body=body)
        body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1',
                                       body=body)

        reqx = BaseSgmlRequestExtractor()

        # assertTrue replaces the deprecated failUnless alias.
        expected_utf8 = [
            Request(url='http://example.com/sample_%C3%B1.html',
                    meta={'link_text': ''}),
            Request(url='http://example.com/sample_%E2%82%AC.html',
                    meta={'link_text':
                          'sample \xe2\x82\xac text'.decode('utf-8')}),
        ]
        self.assertTrue(self._requests_equals(
            reqx.extract_requests(response_utf8), expected_utf8))

        expected_noenc = [
            Request(url='http://example.com/sample_%C3%B1.html',
                    meta={'link_text': ''}),
            Request(url='http://example.com/sample_%E2%82%AC.html',
                    meta={'link_text':
                          'sample \xe2\x82\xac text'.decode('utf-8')}),
        ]
        self.assertTrue(self._requests_equals(
            reqx.extract_requests(response_noenc), expected_noenc))

        expected_latin1 = [
            Request(url='http://example.com/sample_%F1.html',
                    meta={'link_text': ''}),
            Request(url='http://example.com/sample_%E1.html',
                    meta={'link_text':
                          'sample \xe1 text'.decode('latin1')}),
        ]
        self.assertTrue(self._requests_equals(
            reqx.extract_requests(response_latin1), expected_latin1))
    def test_extraction_encoding(self):
        # TODO: use own fixtures
        # Requests extracted from the utf-8 / no-charset responses must match
        # one expectation; the latin1 response has its own.
        body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(
            url='http://example.com/utf8',
            body=body,
            headers={'Content-Type': ['text/html; charset=utf-8']})
        response_noenc = HtmlResponse(url='http://example.com/noenc',
                                      body=body)
        body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1',
                                       body=body)

        def utf8_requests():
            # Fresh Request objects per assertion, as in the original test.
            return [
                Request(url='http://example.com/sample_%C3%B1.html',
                        meta={'link_text': ''}),
                Request(url='http://example.com/sample_%E2%82%AC.html',
                        meta={
                            'link_text':
                            'sample \xe2\x82\xac text'.decode('utf-8')
                        }),
            ]

        reqx = BaseSgmlRequestExtractor()

        # assertTrue replaces the deprecated failUnless alias.
        self.assertTrue(
            self._requests_equals(reqx.extract_requests(response_utf8),
                                  utf8_requests()))

        self.assertTrue(
            self._requests_equals(reqx.extract_requests(response_noenc),
                                  utf8_requests()))

        self.assertTrue(
            self._requests_equals(reqx.extract_requests(response_latin1), [
                Request(url='http://example.com/sample_%F1.html',
                        meta={'link_text': ''}),
                Request(
                    url='http://example.com/sample_%E1.html',
                    meta={'link_text': 'sample \xe1 text'.decode('latin1')}),
            ]))
    def test_csviter_delimiter(self):
        # Turn the comma-separated fixture into a tab-separated one and tell
        # csviter about the new delimiter.
        tab_separated = get_testdata('feeds', 'feed-sample3.csv').replace(',', '\t')
        response = TextResponse(url="http://example.com/", body=tab_separated)
        parsed_rows = list(csviter(response, delimiter='\t'))

        expected_rows = [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ]
        self.assertEqual(parsed_rows, expected_rows)
Example #15
0
    def test_csviter_delimiter_binary_response_assume_utf8_encoding(self):
        # Same tab-separated input as the delimiter test, but wrapped in a
        # plain Response rather than a TextResponse.
        tab_separated = get_testdata('feeds', 'feed-sample3.csv').replace(',', '\t')
        response = Response(url="http://example.com/", body=tab_separated)
        parsed_rows = list(csviter(response, delimiter='\t'))

        expected_rows = [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            # FOOBAR_NL is defined outside this method.
            {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ]
        self.assertEqual(parsed_rows, expected_rows)
Example #16
0
    def test_csviter_exception(self):
        # After four rows have been read from the fixture, the next read
        # must raise StopIteration.
        body = get_testdata('feeds', 'feed-sample3.csv')

        response = TextResponse(url="http://example.com/", body=body)
        rows = csviter(response)

        # The builtin next() works with both the Python 2 ``.next()`` and
        # the Python 3 ``__next__`` iterator protocols, and the local name
        # no longer shadows the builtin ``iter``.
        for _ in range(4):
            next(rows)

        self.assertRaises(StopIteration, next, rows)
Example #17
0
    def test_csviter_exception(self):
        # Read four rows from the fixture, then check that one more read
        # raises StopIteration.
        body = get_testdata("feeds", "feed-sample3.csv")

        response = TextResponse(url="http://example.com/", body=body)
        row_iter = csviter(response)

        # Use the builtin next() instead of the Python 2-only ``.next()``
        # method; the local is also renamed so it does not shadow ``iter``.
        next(row_iter)
        next(row_iter)
        next(row_iter)
        next(row_iter)

        self.assertRaises(StopIteration, next, row_iter)
Example #18
0
    def test_csviter_exception(self):
        # Four reads succeed on the fixture; a fifth must raise
        # StopIteration.
        sample = get_testdata('feeds', 'feed-sample3.csv')
        rows = csviter(TextResponse(url="http://example.com/", body=sample))

        for _ in range(4):
            next(rows)

        self.assertRaises(StopIteration, next, rows)
    def test_csviter_falserow(self):
        # The rows 'a,b' and 'a,b,c,d' are appended to the fixture; the
        # expected result contains only the four original rows.
        fixture = get_testdata('feeds', 'feed-sample3.csv')
        body = '\n'.join([fixture, 'a,b', 'a,b,c,d'])

        response = TextResponse(url="http://example.com/", body=body)
        rows = list(csviter(response))

        expected = [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ]
        self.assertEqual(rows, expected)
    def test_csviter_headers(self):
        # The fixture's first line is passed as ``headers``; the remaining
        # lines form the response body.
        fixture_lines = get_testdata('feeds', 'feed-sample3.csv').splitlines()
        headers = fixture_lines[0].split(',')
        body = '\n'.join(fixture_lines[1:])

        response = TextResponse(url="http://example.com/", body=body)
        rows = list(csviter(response, headers=headers))

        expected = [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ]
        self.assertEqual(rows, expected)
Example #21
0
    def test_csviter_encoding(self):
        # Parse one fixture as latin1 and another as cp852, comparing the
        # decoded rows with unicode literals.
        latin1_bytes = get_testdata("feeds", "feed-sample4.csv")
        cp852_bytes = get_testdata("feeds", "feed-sample5.csv")

        expected_latin1 = [
            {u"id": u"1", u"name": u"latin1", u"value": u"test"},
            {u"id": u"2", u"name": u"something", u"value": u"\xf1\xe1\xe9\xf3"},
        ]
        response = TextResponse(url="http://example.com/", body=latin1_bytes, encoding="latin1")
        self.assertEqual(list(csviter(response)), expected_latin1)

        expected_cp852 = [
            {u"id": u"1", u"name": u"cp852", u"value": u"test"},
            {u"id": u"2", u"name": u"something", u"value": u"\u255a\u2569\u2569\u2569\u2550\u2550\u2557"},
        ]
        response = TextResponse(url="http://example.com/", body=cp852_bytes, encoding="cp852")
        self.assertEqual(list(csviter(response)), expected_cp852)
Example #22
0
    def test_csviter_delimiter_binary_response_assume_utf8_encoding(self):
        # A plain Response (not a TextResponse) carries the tab-separated
        # fixture; the rows are still expected back as unicode.
        tsv_body = get_testdata("feeds", "feed-sample3.csv").replace(",", "\t")
        response = Response(url="http://example.com/", body=tsv_body)
        rows = list(csviter(response, delimiter="\t"))

        expected = [
            {u"id": u"1", u"name": u"alpha", u"value": u"foobar"},
            {u"id": u"2", u"name": u"unicode", u"value": u"\xfan\xedc\xf3d\xe9\u203d"},
            {u"id": u"3", u"name": u"multi", u"value": u"foo\nbar"},
            {u"id": u"4", u"name": u"empty", u"value": u""},
        ]
        self.assertEqual(rows, expected)
Example #23
0
    def test_csviter_falserow(self):
        # Append the rows "a,b" and "a,b,c,d" to the fixture; they are not
        # part of the expected result.
        original = get_testdata("feeds", "feed-sample3.csv")
        appended_rows = ["a,b", "a,b,c,d"]
        body = "\n".join([original] + appended_rows)

        response = TextResponse(url="http://example.com/", body=body)
        rows = list(csviter(response))

        expected = [
            {u"id": u"1", u"name": u"alpha", u"value": u"foobar"},
            {u"id": u"2", u"name": u"unicode", u"value": u"\xfan\xedc\xf3d\xe9\u203d"},
            {u"id": u"3", u"name": u"multi", u"value": u"foo\nbar"},
            {u"id": u"4", u"name": u"empty", u"value": u""},
        ]
        self.assertEqual(rows, expected)
Example #24
0
    def test_csviter_headers(self):
        # Split the fixture into its first line (used as explicit headers)
        # and the rest (used as the response body).
        all_lines = get_testdata("feeds", "feed-sample3.csv").splitlines()
        headers = all_lines[0].split(",")
        body = "\n".join(all_lines[1:])

        response = TextResponse(url="http://example.com/", body=body)
        rows = list(csviter(response, headers=headers))

        expected = [
            {u"id": u"1", u"name": u"alpha", u"value": u"foobar"},
            {u"id": u"2", u"name": u"unicode", u"value": u"\xfan\xedc\xf3d\xe9\u203d"},
            {u"id": u"3", u"name": u"multi", u"value": u"foo\nbar"},
            {u"id": u"4", u"name": u"empty", u"value": u""},
        ]
        self.assertEqual(rows, expected)
    def test_csviter_defaults(self):
        # Run csviter with default arguments, then verify row contents and
        # that every key and value is a unicode object.
        body = get_testdata('feeds', 'feed-sample3.csv')
        response = TextResponse(url="http://example.com/", body=body)

        result = list(csviter(response))
        expected = [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ]
        self.assertEqual(result, expected)

        # Explicit type check, separate from the equality check above.
        # assertTrue replaces the deprecated assert_ alias.
        for result_row in result:
            self.assertTrue(all(isinstance(k, unicode) for k in result_row))
            self.assertTrue(all(isinstance(v, unicode) for v in result_row.values()))
Example #26
0
    def test_csviter_defaults(self):
        # Default-argument csviter run: check the parsed rows, then check
        # that all keys and values are unicode objects.
        body = get_testdata("feeds", "feed-sample3.csv")
        response = TextResponse(url="http://example.com/", body=body)

        result = list(csviter(response))
        self.assertEqual(
            result,
            [
                {u"id": u"1", u"name": u"alpha", u"value": u"foobar"},
                {u"id": u"2", u"name": u"unicode", u"value": u"\xfan\xedc\xf3d\xe9\u203d"},
                {u"id": u"3", u"name": u"multi", u"value": u"foo\nbar"},
                {u"id": u"4", u"name": u"empty", u"value": u""},
            ],
        )

        # Types are asserted explicitly in addition to the equality check.
        # assertTrue replaces the deprecated assert_ alias.
        for result_row in result:
            keys_are_unicode = all(isinstance(k, unicode) for k in result_row)
            self.assertTrue(keys_are_unicode)
            values_are_unicode = all(isinstance(v, unicode) for v in result_row.values())
            self.assertTrue(values_are_unicode)
 def setUp(self):
     # Wrap the sgml_linkextractor.html fixture in the response stored on
     # self.response.
     self.response = HtmlResponse(
         url='http://example.com/index',
         body=get_testdata('link_extractor', 'sgml_linkextractor.html'),
     )
 def setUp(self):
     # Load the HTML fixture and store it as an HtmlResponse on the
     # instance.
     fixture_html = get_testdata("link_extractor", "sgml_linkextractor.html")
     index_url = "http://example.com/index"
     self.response = HtmlResponse(url=index_url, body=fixture_html)
 def setUp(self):
     # self.response is built from the sgml_linkextractor.html fixture.
     html = get_testdata('link_extractor', 'sgml_linkextractor.html')
     self.response = HtmlResponse(
         url='http://example.com/index',
         body=html,
     )