def test_extraction_encoding(self):
    """Check links extracted from the noenc and latin1 fixture pages."""
    noenc_body = get_testdata('link_extractor', 'linkextractor_noenc.html')
    response_utf8 = HtmlResponse(
        url='http://example.com/utf8',
        body=noenc_body,
        headers={'Content-Type': ['text/html; charset=utf-8']})
    response_noenc = HtmlResponse(
        url='http://example.com/noenc', body=noenc_body)
    latin1_body = get_testdata('link_extractor', 'linkextractor_latin1.html')
    response_latin1 = HtmlResponse(
        url='http://example.com/latin1', body=latin1_body)

    extractor = BaseSgmlLinkExtractor()

    # Same body with an explicit utf-8 charset header ...
    utf8_links = extractor.extract_links(response_utf8)
    self.assertEqual(utf8_links, [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text='sample \xe2\x82\xac text'.decode('utf-8')),
    ])

    # ... and without any header: the same links are expected.
    noenc_links = extractor.extract_links(response_noenc)
    self.assertEqual(noenc_links, [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text='sample \xe2\x82\xac text'.decode('utf-8')),
    ])

    # latin1 fixture: URLs carry single-byte escapes (%F1, %E1).
    latin1_links = extractor.extract_links(response_latin1)
    self.assertEqual(latin1_links, [
        Link(url='http://example.com/sample_%F1.html', text=''),
        Link(url='http://example.com/sample_%E1.html',
             text='sample \xe1 text'.decode('latin1')),
    ])
def test_csviter_encoding(self):
    """csviter decodes rows using the encoding set on the response."""
    latin1_body = get_testdata('feeds', 'feed-sample4.csv')
    cp852_body = get_testdata('feeds', 'feed-sample5.csv')

    # feed-sample4.csv read as latin1
    response = TextResponse(url="http://example.com/", body=latin1_body,
                            encoding='latin1')
    latin1_rows = list(csviter(response))
    self.assertEqual(latin1_rows, [
        {u'id': u'1', u'name': u'latin1', u'value': u'test'},
        {u'id': u'2', u'name': u'something', u'value': u'\xf1\xe1\xe9\xf3'},
    ])

    # feed-sample5.csv read as cp852
    response = TextResponse(url="http://example.com/", body=cp852_body,
                            encoding='cp852')
    cp852_rows = list(csviter(response))
    self.assertEqual(cp852_rows, [
        {u'id': u'1', u'name': u'cp852', u'value': u'test'},
        {u'id': u'2', u'name': u'something',
         u'value': u'\u255a\u2569\u2569\u2569\u2550\u2550\u2557'},
    ])
def _test_data(formats):
    """Load the uncompressed sample feed plus one Response per format.

    Returns a ``(uncompressed_body, responses)`` tuple, where ``responses``
    maps each entry of ``formats`` to a Response whose body is the fixture
    ``feed-sample1.<format>`` from the ``compressed`` test data directory.
    """
    reference_body = get_testdata('compressed', 'feed-sample1.xml')

    responses_by_format = {}
    for fmt in formats:
        fixture_name = 'feed-sample1.' + fmt
        compressed_body = get_testdata('compressed', fixture_name)
        responses_by_format[fmt] = Response(
            'http://foo.com/bar', body=compressed_body)

    return reference_body, responses_by_format
def test_extraction_encoding(self):
    """Check links extracted from the noenc and latin1 fixture pages."""
    noenc_body = get_testdata("link_extractor", "linkextractor_noenc.html")
    response_utf8 = HtmlResponse(
        url="http://example.com/utf8",
        body=noenc_body,
        headers={"Content-Type": ["text/html; charset=utf-8"]},
    )
    response_noenc = HtmlResponse(url="http://example.com/noenc", body=noenc_body)
    latin1_body = get_testdata("link_extractor", "linkextractor_latin1.html")
    response_latin1 = HtmlResponse(url="http://example.com/latin1", body=latin1_body)

    extractor = BaseSgmlLinkExtractor()

    # Response with an explicit utf-8 charset header.
    found = extractor.extract_links(response_utf8)
    self.assertEqual(
        found,
        [
            Link(url="http://example.com/sample_%C3%B1.html", text=""),
            Link(
                url="http://example.com/sample_%E2%82%AC.html",
                text="sample \xe2\x82\xac text".decode("utf-8"),
            ),
        ],
    )

    # Same body, no headers: identical links are expected.
    found = extractor.extract_links(response_noenc)
    self.assertEqual(
        found,
        [
            Link(url="http://example.com/sample_%C3%B1.html", text=""),
            Link(
                url="http://example.com/sample_%E2%82%AC.html",
                text="sample \xe2\x82\xac text".decode("utf-8"),
            ),
        ],
    )

    # latin1 fixture: URLs carry single-byte escapes (%F1, %E1).
    found = extractor.extract_links(response_latin1)
    self.assertEqual(
        found,
        [
            Link(url="http://example.com/sample_%F1.html", text=""),
            Link(
                url="http://example.com/sample_%E1.html",
                text="sample \xe1 text".decode("latin1"),
            ),
        ],
    )
def _test_data(formats):
    """Return the plain sample feed body and a Response for every format.

    The second element of the returned tuple is a dict keyed by the entries
    of ``formats``; each value wraps the ``compressed`` fixture named
    ``feed-sample1.<format>``.
    """
    plain_body = get_testdata("compressed", "feed-sample1.xml")

    by_format = {}
    for extension in formats:
        payload = get_testdata("compressed", "feed-sample1." + extension)
        by_format[extension] = Response("http://foo.com/bar", body=payload)

    return plain_body, by_format
def test_extraction_encoding(self):
    """Check links extracted from the noenc and latin1 fixture pages."""
    raw_noenc = get_testdata('link_extractor', 'linkextractor_noenc.html')
    response_utf8 = HtmlResponse(
        url='http://example.com/utf8',
        body=raw_noenc,
        headers={'Content-Type': ['text/html; charset=utf-8']})
    response_noenc = HtmlResponse(url='http://example.com/noenc',
                                  body=raw_noenc)
    raw_latin1 = get_testdata('link_extractor', 'linkextractor_latin1.html')
    response_latin1 = HtmlResponse(url='http://example.com/latin1',
                                   body=raw_latin1)

    link_extractor = BaseSgmlLinkExtractor()

    # With an explicit utf-8 charset header.
    links_utf8 = link_extractor.extract_links(response_utf8)
    self.assertEqual(links_utf8, [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text='sample \xe2\x82\xac text'.decode('utf-8'))
    ])

    # Same body, no headers: identical links are expected.
    links_noenc = link_extractor.extract_links(response_noenc)
    self.assertEqual(links_noenc, [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text='sample \xe2\x82\xac text'.decode('utf-8'))
    ])

    # latin1 fixture: URLs carry single-byte escapes (%F1, %E1).
    links_latin1 = link_extractor.extract_links(response_latin1)
    self.assertEqual(links_latin1, [
        Link(url='http://example.com/sample_%F1.html', text=''),
        Link(url='http://example.com/sample_%E1.html',
             text='sample \xe1 text'.decode('latin1'))
    ])
def test_csviter_encoding(self):
    """csviter decodes rows using the encoding set on the response."""
    sample_latin1 = get_testdata('feeds', 'feed-sample4.csv')
    sample_cp852 = get_testdata('feeds', 'feed-sample5.csv')

    # First fixture, declared as latin1.
    response = TextResponse(url="http://example.com/", body=sample_latin1,
                            encoding='latin1')
    parsed = list(csviter(response))
    self.assertEqual(parsed, [
        {u'id': u'1', u'name': u'latin1', u'value': u'test'},
        {u'id': u'2', u'name': u'something', u'value': u'\xf1\xe1\xe9\xf3'},
    ])

    # Second fixture, declared as cp852.
    response = TextResponse(url="http://example.com/", body=sample_cp852,
                            encoding='cp852')
    parsed = list(csviter(response))
    self.assertEqual(parsed, [
        {u'id': u'1', u'name': u'cp852', u'value': u'test'},
        {u'id': u'2', u'name': u'something',
         u'value': u'\u255a\u2569\u2569\u2569\u2550\u2550\u2557'},
    ])
def test_csviter_falserow(self):
    """Lines with the wrong number of fields must not show up as rows."""
    sample = get_testdata('feeds', 'feed-sample3.csv')
    # Append one line with too few and one with too many fields; neither is
    # expected in the output below.
    malformed_lines = ('a,b', 'a,b,c,d')
    body = '\n'.join((sample,) + malformed_lines)
    response = TextResponse(url="http://example.com/", body=body)

    rows = list(csviter(response))

    self.assertEqual(rows, [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode',
         u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ])
def test_csviter_headers(self):
    """csviter uses explicitly passed headers for a body without a header line."""
    lines = get_testdata('feeds', 'feed-sample3.csv').splitlines()
    # The first line of the fixture holds the column names; the remaining
    # lines are sent as the response body.
    headers = lines[0].split(',')
    body = '\n'.join(lines[1:])
    response = TextResponse(url="http://example.com/", body=body)

    rows = list(csviter(response, headers=headers))

    self.assertEqual(rows, [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode',
         u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ])
def test_csviter_defaults(self):
    """csviter with default arguments yields one unicode dict per row."""
    body = get_testdata('feeds', 'feed-sample3.csv')
    response = TextResponse(url="http://example.com/", body=body)
    csv = csviter(response)

    result = [row for row in csv]
    self.assertEqual(result, [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode',
         u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ])

    # Explicit type check: every key and value must be a unicode object.
    # assertTrue replaces the deprecated assert_ alias.
    for result_row in result:
        self.assertTrue(
            all(isinstance(k, unicode) for k in result_row.keys()))
        self.assertTrue(
            all(isinstance(v, unicode) for v in result_row.values()))
def test_extraction_encoding(self):
    """Check requests extracted from the noenc and latin1 fixture pages."""
    # TODO: use own fixtures
    body = get_testdata('link_extractor', 'linkextractor_noenc.html')
    response_utf8 = HtmlResponse(
        url='http://example.com/utf8',
        body=body,
        headers={'Content-Type': ['text/html; charset=utf-8']})
    response_noenc = HtmlResponse(url='http://example.com/noenc', body=body)
    body = get_testdata('link_extractor', 'linkextractor_latin1.html')
    response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)

    reqx = BaseSgmlRequestExtractor()

    # assertTrue replaces the deprecated failUnless alias throughout.
    self.assertTrue(
        self._requests_equals(
            reqx.extract_requests(response_utf8),
            [
                Request(url='http://example.com/sample_%C3%B1.html',
                        meta={'link_text': ''}),
                Request(url='http://example.com/sample_%E2%82%AC.html',
                        meta={'link_text':
                              'sample \xe2\x82\xac text'.decode('utf-8')}),
            ]
        )
    )

    # Same body without headers: the same requests are expected.
    self.assertTrue(
        self._requests_equals(
            reqx.extract_requests(response_noenc),
            [
                Request(url='http://example.com/sample_%C3%B1.html',
                        meta={'link_text': ''}),
                Request(url='http://example.com/sample_%E2%82%AC.html',
                        meta={'link_text':
                              'sample \xe2\x82\xac text'.decode('utf-8')}),
            ]
        )
    )

    # latin1 fixture: URLs carry single-byte escapes (%F1, %E1).
    self.assertTrue(
        self._requests_equals(
            reqx.extract_requests(response_latin1),
            [
                Request(url='http://example.com/sample_%F1.html',
                        meta={'link_text': ''}),
                Request(url='http://example.com/sample_%E1.html',
                        meta={'link_text':
                              'sample \xe1 text'.decode('latin1')}),
            ]
        )
    )
def test_extraction_encoding(self):
    """Check requests extracted from the noenc and latin1 fixture pages."""
    # TODO: use own fixtures
    body = get_testdata('link_extractor', 'linkextractor_noenc.html')
    response_utf8 = HtmlResponse(
        url='http://example.com/utf8',
        body=body,
        headers={'Content-Type': ['text/html; charset=utf-8']})
    response_noenc = HtmlResponse(url='http://example.com/noenc', body=body)
    body = get_testdata('link_extractor', 'linkextractor_latin1.html')
    response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)

    reqx = BaseSgmlRequestExtractor()

    # assertTrue replaces the deprecated failUnless alias throughout.
    self.assertTrue(
        self._requests_equals(reqx.extract_requests(response_utf8), [
            Request(url='http://example.com/sample_%C3%B1.html',
                    meta={'link_text': ''}),
            Request(url='http://example.com/sample_%E2%82%AC.html',
                    meta={
                        'link_text': 'sample \xe2\x82\xac text'.decode('utf-8')
                    })
        ]))

    # Same body without headers: the same requests are expected.
    self.assertTrue(
        self._requests_equals(reqx.extract_requests(response_noenc), [
            Request(url='http://example.com/sample_%C3%B1.html',
                    meta={'link_text': ''}),
            Request(url='http://example.com/sample_%E2%82%AC.html',
                    meta={
                        'link_text': 'sample \xe2\x82\xac text'.decode('utf-8')
                    })
        ]))

    # latin1 fixture: URLs carry single-byte escapes (%F1, %E1).
    self.assertTrue(
        self._requests_equals(reqx.extract_requests(response_latin1), [
            Request(url='http://example.com/sample_%F1.html',
                    meta={'link_text': ''}),
            Request(
                url='http://example.com/sample_%E1.html',
                meta={'link_text': 'sample \xe1 text'.decode('latin1')})
        ]))
def test_csviter_delimiter(self):
    """csviter honours a custom (tab) delimiter on a TextResponse."""
    comma_separated = get_testdata('feeds', 'feed-sample3.csv')
    # Turn the comma separated fixture into a tab separated one.
    body = comma_separated.replace(',', '\t')
    response = TextResponse(url="http://example.com/", body=body)

    rows = list(csviter(response, delimiter='\t'))

    self.assertEqual(rows, [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode',
         u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ])
def test_csviter_delimiter_binary_response_assume_utf8_encoding(self):
    """csviter parses a plain Response (no declared encoding) with tabs."""
    comma_separated = get_testdata('feeds', 'feed-sample3.csv')
    # Turn the comma separated fixture into a tab separated one.
    body = comma_separated.replace(',', '\t')
    # A plain Response rather than a TextResponse is used on purpose here.
    response = Response(url="http://example.com/", body=body)

    rows = list(csviter(response, delimiter='\t'))

    self.assertEqual(rows, [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode',
         u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ])
def test_csviter_exception(self):
    """After the four sample rows, csviter raises StopIteration."""
    body = get_testdata('feeds', 'feed-sample3.csv')
    response = TextResponse(url="http://example.com/", body=body)
    rows = csviter(response)

    # Consume the four rows through the next() builtin instead of the
    # Python 2-only .next() method; the local no longer shadows iter().
    for _ in range(4):
        next(rows)

    self.assertRaises(StopIteration, next, rows)
def test_csviter_exception(self):
    """After the four sample rows, csviter raises StopIteration."""
    body = get_testdata("feeds", "feed-sample3.csv")
    response = TextResponse(url="http://example.com/", body=body)
    rows = csviter(response)

    # Consume the four rows through the next() builtin instead of the
    # Python 2-only .next() method; the local no longer shadows iter().
    for _ in range(4):
        next(rows)

    self.assertRaises(StopIteration, next, rows)
def test_csviter_exception(self):
    """After the four sample rows, csviter raises StopIteration."""
    sample = get_testdata('feeds', 'feed-sample3.csv')
    response = TextResponse(url="http://example.com/", body=sample)
    row_iterator = csviter(response)

    # Drain the four rows; the fifth next() call must fail.
    for _ in range(4):
        next(row_iterator)

    self.assertRaises(StopIteration, next, row_iterator)
def test_csviter_falserow(self):
    """Lines with the wrong number of fields must not show up as rows."""
    sample = get_testdata('feeds', 'feed-sample3.csv')
    # 'a,b' has too few fields and 'a,b,c,d' too many; neither is expected
    # in the output below.
    body = '\n'.join((sample, 'a,b', 'a,b,c,d'))
    response = TextResponse(url="http://example.com/", body=body)

    parsed_rows = list(csviter(response))

    self.assertEqual(parsed_rows, [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode',
         u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ])
def test_csviter_headers(self):
    """csviter uses explicitly passed headers for a body without a header line."""
    fixture_lines = get_testdata('feeds', 'feed-sample3.csv').splitlines()
    # Column names come from the first fixture line; only the remaining
    # lines are sent as the response body.
    headers = fixture_lines[0].split(',')
    body = '\n'.join(fixture_lines[1:])
    response = TextResponse(url="http://example.com/", body=body)

    parsed_rows = list(csviter(response, headers=headers))

    self.assertEqual(parsed_rows, [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode',
         u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ])
def test_csviter_encoding(self):
    """csviter decodes rows using the encoding set on the response."""
    latin1_body = get_testdata("feeds", "feed-sample4.csv")
    cp852_body = get_testdata("feeds", "feed-sample5.csv")

    # feed-sample4.csv read as latin1
    response = TextResponse(
        url="http://example.com/", body=latin1_body, encoding="latin1"
    )
    rows = list(csviter(response))
    expected_latin1 = [
        {u"id": u"1", u"name": u"latin1", u"value": u"test"},
        {u"id": u"2", u"name": u"something", u"value": u"\xf1\xe1\xe9\xf3"},
    ]
    self.assertEqual(rows, expected_latin1)

    # feed-sample5.csv read as cp852
    response = TextResponse(
        url="http://example.com/", body=cp852_body, encoding="cp852"
    )
    rows = list(csviter(response))
    expected_cp852 = [
        {u"id": u"1", u"name": u"cp852", u"value": u"test"},
        {
            u"id": u"2",
            u"name": u"something",
            u"value": u"\u255a\u2569\u2569\u2569\u2550\u2550\u2557",
        },
    ]
    self.assertEqual(rows, expected_cp852)
def test_csviter_delimiter_binary_response_assume_utf8_encoding(self):
    """csviter parses a plain Response (no declared encoding) with tabs."""
    comma_separated = get_testdata("feeds", "feed-sample3.csv")
    # Turn the comma separated fixture into a tab separated one.
    body = comma_separated.replace(",", "\t")
    # A plain Response rather than a TextResponse is used on purpose here.
    response = Response(url="http://example.com/", body=body)

    rows = list(csviter(response, delimiter="\t"))

    expected = [
        {u"id": u"1", u"name": u"alpha", u"value": u"foobar"},
        {u"id": u"2", u"name": u"unicode", u"value": u"\xfan\xedc\xf3d\xe9\u203d"},
        {u"id": u"3", u"name": u"multi", u"value": u"foo\nbar"},
        {u"id": u"4", u"name": u"empty", u"value": u""},
    ]
    self.assertEqual(rows, expected)
def test_csviter_falserow(self):
    """Lines with the wrong number of fields must not show up as rows."""
    sample = get_testdata("feeds", "feed-sample3.csv")
    # "a,b" has too few fields and "a,b,c,d" too many; neither is expected
    # in the output below.
    body = "\n".join((sample, "a,b", "a,b,c,d"))
    response = TextResponse(url="http://example.com/", body=body)

    rows = list(csviter(response))

    expected = [
        {u"id": u"1", u"name": u"alpha", u"value": u"foobar"},
        {u"id": u"2", u"name": u"unicode", u"value": u"\xfan\xedc\xf3d\xe9\u203d"},
        {u"id": u"3", u"name": u"multi", u"value": u"foo\nbar"},
        {u"id": u"4", u"name": u"empty", u"value": u""},
    ]
    self.assertEqual(rows, expected)
def test_csviter_headers(self):
    """csviter uses explicitly passed headers for a body without a header line."""
    fixture_lines = get_testdata("feeds", "feed-sample3.csv").splitlines()
    # Column names come from the first fixture line; only the remaining
    # lines are sent as the response body.
    headers = fixture_lines[0].split(",")
    body = "\n".join(fixture_lines[1:])
    response = TextResponse(url="http://example.com/", body=body)

    rows = list(csviter(response, headers=headers))

    expected = [
        {u"id": u"1", u"name": u"alpha", u"value": u"foobar"},
        {u"id": u"2", u"name": u"unicode", u"value": u"\xfan\xedc\xf3d\xe9\u203d"},
        {u"id": u"3", u"name": u"multi", u"value": u"foo\nbar"},
        {u"id": u"4", u"name": u"empty", u"value": u""},
    ]
    self.assertEqual(rows, expected)
def test_csviter_defaults(self):
    """csviter with default arguments yields one unicode dict per row."""
    body = get_testdata('feeds', 'feed-sample3.csv')
    response = TextResponse(url="http://example.com/", body=body)
    csv = csviter(response)

    result = [row for row in csv]
    self.assertEqual(result,
                     [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
                      {u'id': u'2', u'name': u'unicode',
                       u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
                      {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
                      {u'id': u'4', u'name': u'empty', u'value': u''}])

    # Explicit type check: every key and value must be a unicode object.
    # assertTrue replaces the deprecated assert_ alias.
    for result_row in result:
        self.assertTrue(all((isinstance(k, unicode) for k in result_row.keys())))
        self.assertTrue(all((isinstance(v, unicode) for v in result_row.values())))
def test_csviter_defaults(self):
    """csviter with default arguments yields one unicode dict per row."""
    body = get_testdata("feeds", "feed-sample3.csv")
    response = TextResponse(url="http://example.com/", body=body)
    csv = csviter(response)

    result = [row for row in csv]
    self.assertEqual(
        result,
        [
            {u"id": u"1", u"name": u"alpha", u"value": u"foobar"},
            {u"id": u"2", u"name": u"unicode", u"value": u"\xfan\xedc\xf3d\xe9\u203d"},
            {u"id": u"3", u"name": u"multi", u"value": u"foo\nbar"},
            {u"id": u"4", u"name": u"empty", u"value": u""},
        ],
    )

    # Explicit type check: every key and value must be a unicode object.
    # assertTrue replaces the deprecated assert_ alias.
    for result_row in result:
        self.assertTrue(all((isinstance(k, unicode) for k in result_row.keys())))
        self.assertTrue(all((isinstance(v, unicode) for v in result_row.values())))
def setUp(self):
    """Build the shared HtmlResponse from the sgml_linkextractor fixture."""
    fixture_html = get_testdata('link_extractor', 'sgml_linkextractor.html')
    self.response = HtmlResponse(
        url='http://example.com/index',
        body=fixture_html)
def setUp(self):
    """Build the shared HtmlResponse from the sgml_linkextractor fixture."""
    page_source = get_testdata("link_extractor", "sgml_linkextractor.html")
    self.response = HtmlResponse(
        url="http://example.com/index",
        body=page_source,
    )