def test_xpath_result_not_list(element):
    """Exercise an XPath whose result is a scalar string rather than a node list.

    ``normalize-space(//span)`` is evaluated through both entry points:

    * ``extract`` is expected to return the plain string ``"a"``.
    * ``extract_first`` is expected to emit a ``UserWarning``; its return
      value is checked as well.

    ``element`` is supplied by a fixture defined outside this view.
    """
    extractor = XPathExtractor("normalize-space(//span)")
    assert extractor.extract(element) == "a"

    with pytest.warns(UserWarning):
        # This comparison used to be a bare expression whose result was
        # discarded, so a wrong return value could never fail the test.
        assert extractor.extract_first(element) == "a"
def test_invalid_xpath_expr(element, expr):
    """Extracting with an invalid XPath must raise ``ExprError``.

    The raised error has to reference the extractor that failed and wrap an
    ``XPathEvalError`` as its underlying cause.
    """
    xpath = XPathExtractor(expr)

    with pytest.raises(ExprError) as excinfo:
        xpath.extract(element)

    error = excinfo.value
    # The wrapper keeps a handle on the very same extractor instance.
    assert error.extractor is xpath
    assert isinstance(error.exc, XPathEvalError)
def test_invalid_xpath_expr_by_extract(element, expr):
    """An invalid XPath fails on ``extract`` and leaves the extractor unbuilt.

    Checks the ``built`` flag before and after the failing call, the link
    from the error back to the extractor, the wrapped exception type and
    the shape of the error message.
    """
    xpath = XPathExtractor(expr)
    assert not xpath.built

    with pytest.raises(ExprError) as excinfo:
        xpath.extract(element)

    # The failed extraction must not flip the flag.
    assert not xpath.built

    error = excinfo.value
    assert error.extractor is xpath
    assert isinstance(error.exc, XPathError)

    message_pattern = r"ExprError with .+? raised by .+? extracting"
    assert re.match(message_pattern, str(error))
def test_invalid_xpath_expr_by_XPathEvalError_from_extract(element, expr):
    """An XPath that fails at evaluation time surfaces as ``ExprError``.

    The wrapped cause must be lxml's ``XPathEvalError`` and the message must
    follow the ``ExprError with ... raised by ... extracting`` shape.
    """
    xpath = XPathExtractor(expr)

    with pytest.raises(ExprError) as excinfo:
        xpath.extract(element)

    error = excinfo.value
    assert error.extractor is xpath

    # Third Party Library
    from lxml.etree import XPathEvalError

    assert isinstance(error.exc, XPathEvalError)

    message_pattern = r"ExprError with .+? raised by .+? extracting"
    assert re.match(message_pattern, str(error))
def test_field_overwrites_item_parameter_type_creation(
    stack_frame_support, item_property
):
    """Creating an ``Item`` subclass through ``type()`` must raise ``SyntaxError``.

    The class namespace maps ``item_property`` to a ``Field``; both arguments
    are supplied by fixtures/parametrization defined outside this view.

    With ``stack_frame_support`` the error must point at the ``type(...)``
    call: this file, the call's line, offset 8 and the stripped source text.
    Otherwise filename, line number and offset are ``None`` and the text is
    the ``<name>=Field(...)`` form.

    Layout matters here. The ``type(...)`` call must stay on one physical
    line, spelled exactly like the expected text (including the spacing
    before the trailing comment), and must sit six lines above the
    ``f_lineno`` assertion. Do not insert or remove lines in between.
    """
    with pytest.raises(SyntaxError) as catch:
        # fmt: off
        type("Parameter", (Item,), {item_property: Field(XPathExtractor("./span[@class='name']"))}) # noqa: E950
        # fmt: on

    exc = catch.value
    if stack_frame_support:
        assert exc.filename == __file__
        assert exc.lineno == inspect.currentframe().f_lineno - 6
        assert exc.offset == 8
        assert (exc.text == """ type("Parameter", (Item,), {item_property: Field(XPathExtractor("./span[@class='name']"))}) # noqa: E950 """.strip())
    else:
        assert exc.filename is None
        assert exc.lineno is None
        assert exc.offset is None
        assert (
            exc.text
            == f"""{item_property}=Field(XPathExtractor("./span[@class='name']"))"""
        )
def test_invalid_xpath_expr(expr):
    """Constructing an extractor from an invalid XPath must raise ``ExprError``.

    The wrapped cause must be lxml's ``XPathError`` and the message must
    follow the ``ExprError with ... raised by ... extracting`` shape.
    """
    with pytest.raises(ExprError) as excinfo:
        XPathExtractor(expr)

    error = excinfo.value

    # Third Party Library
    from lxml.etree import XPathError

    assert isinstance(error.exc, XPathError)

    message_pattern = r"ExprError with .+? raised by .+? extracting"
    assert re.match(message_pattern, str(error))
def test_field_xpath_extract_result_not_list(element0, build_first):
    """A ``Field`` over a scalar-valued XPath returns the string itself.

    Also tracks the ``built`` flag of the field and of its extractor: both
    start out false, and both are true after an explicit ``build()`` (when
    ``build_first`` is set) and after the extraction.
    """
    title_xpath = XPathExtractor("normalize-space(//div[@class='title'])")
    field = Field(title_xpath)

    # Nothing is built right after construction.
    assert not field.built
    assert not field.extractor.built

    if build_first:
        field.build()
        assert field.built
        assert field.extractor.built

    extracted = field.extract(element0)
    assert extracted == "Title 1"

    # Extraction leaves both the field and its extractor built.
    assert field.built
    assert field.extractor.built
def test_field_name_overwrite_item_parameter_type_creation():
    """Creating an ``Item`` subclass through ``type()`` with a ``"name"`` field must raise ``SyntaxError``.

    The error must point at the ``type(...)`` call: this file, the call's
    line, offset 8 and the stripped source text of that line.

    Layout matters here. The ``type(...)`` call must stay on one physical
    line, spelled exactly like the expected text, and must sit five lines
    above the ``f_lineno`` assertion. Do not insert or remove lines in
    between.
    """
    with pytest.raises(SyntaxError) as catch:
        # fmt: off
        type("Parameter", (Item,), {"name": Field(XPathExtractor("./span[@class='name']"))})
        # fmt: on

    exc = catch.value
    assert exc.filename == __file__
    assert exc.lineno == inspect.currentframe().f_lineno - 5
    assert exc.offset == 8
    assert (
        exc.text
        == 'type("Parameter", (Item,), {"name": Field(XPathExtractor("./span[@class=\'name\']"))})'
    )
# One-line Item subclass declaring a single field, `name`, extracted with the
# XPath ./span[@class='name']. The E701 suppression covers the class body
# being written on the same line as the class header.
# NOTE(review): presumably the surrounding test asserts on this line's source
# text/position, so the single-line layout should be kept - confirm against
# the enclosing test before reformatting.
class Parameter(Item): name = Field( XPathExtractor("./span[@class='name']")) # noqa: B950, E701
class Channel(Item):
    # Scalar channel metadata; every XPath is relative to the node the item
    # is applied to.
    title = Field(XPathExtractor("./title/text()"))
    link = Field(XPathExtractor("./link/text()"))
    description = Field(XPathExtractor("./description/text()"))
    language = Field(XPathExtractor("./language/text()"))

    # Publication timestamps.
    publish_date = Field(XPathExtractor("./pubDate/text()"))
    last_build_date = Field(XPathExtractor("./lastBuildDate/text()"))

    # Feed tooling and contact details.
    docs = Field(XPathExtractor("./docs/text()"))
    generator = Field(XPathExtractor("./generator/text()"))
    managing_editor = Field(XPathExtractor("./managingEditor/text()"))
    web_master = Field(XPathExtractor("./webMaster/text()"))

    # Nested entries taken from the ./item children, declared with is_many=True.
    items = ChannelItem(XPathExtractor("./item"), is_many=True)
class ChannelItem(Item):
    # title and link are the only fields declared with a default ("").
    title = Field(
        XPathExtractor("./title/text()"),
        default="",
    )
    link = Field(
        XPathExtractor("./link/text()"),
        default="",
    )

    # The remaining fields are declared without a default.
    description = Field(XPathExtractor("./description/text()"))
    publish_date = Field(XPathExtractor("./pubDate/text()"))
    guid = Field(XPathExtractor("./guid/text()"))
# test_complex_item_extract_xml_data(build_first)
#
# Parses assets/sample-rss-2.xml (located next to this file) with lxml's
# `fromstring`, declares two nested Item classes (ChannelItem and Channel)
# and checks three extractions against literal expectations:
#   1. ChannelItem(CSSExtractor("channel>item")) yields items_result[0];
#   2. the same item with is_many=True yields the whole items_result list;
#   3. Channel(XPathExtractor("//channel")) yields the channel metadata dict
#      with items_result under "items".
# When build_first is truthy, item.build() is called before each extraction.
#
# NOTE(review): this function is spread over three physical lines here, and
# the break between the second and third falls inside the string literal
# ending "...one of them. ". The exact whitespace of that literal cannot be
# confirmed from this view - check it against the asset file before
# reformatting this test.
def test_complex_item_extract_xml_data(build_first): from lxml.etree import fromstring sample_rss_path = Path(__file__).parent / "assets" / "sample-rss-2.xml" text = sample_rss_path.read_text() element = fromstring(text) class ChannelItem(Item): title = Field(XPathExtractor("./title/text()"), default="") link = Field(XPathExtractor("./link/text()"), default="") description = Field(XPathExtractor("./description/text()")) publish_date = Field(XPathExtractor("./pubDate/text()")) guid = Field(XPathExtractor("./guid/text()")) class Channel(Item): title = Field(XPathExtractor("./title/text()")) link = Field(XPathExtractor("./link/text()")) description = Field(XPathExtractor("./description/text()")) language = Field(XPathExtractor("./language/text()")) publish_date = Field(XPathExtractor("./pubDate/text()")) last_build_date = Field(XPathExtractor("./lastBuildDate/text()")) docs = Field(XPathExtractor("./docs/text()")) generator = Field(XPathExtractor("./generator/text()")) managing_editor = Field(XPathExtractor("./managingEditor/text()")) web_master = Field(XPathExtractor("./webMaster/text()")) items = ChannelItem(XPathExtractor("./item"), is_many=True) items_result = [ { "title": "Star City", "link": "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp", "description": ("How do Americans get ready to work " "with Russians aboard the International Space Station? " "They take a crash course in culture, " "language and protocol at Russia's " '<a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.' 
), "publish_date": "Tue, 03 Jun 2003 09:39:21 GMT", "guid": "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573", }, { "title": "", "link": "", "description": ( "Sky watchers in Europe, Asia, and parts of Alaska and Canada " "will experience a " '<a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">' # noqa: B950 "partial eclipse of the Sun" "</a> on Saturday, May 31st."), "publish_date": "Fri, 30 May 2003 11:06:42 GMT", "guid": "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572", }, { "title": "The Engine That Does More", "link": "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp", "description": ("Before man travels to Mars, " "NASA hopes to design new engines " "that will let us fly through the Solar System more quickly. " "The proposed VASIMR engine would do that."), "publish_date": "Tue, 27 May 2003 08:37:32 GMT", "guid": "http://liftoff.msfc.nasa.gov/2003/05/27.html#item571", }, { "title": "Astronauts' Dirty Laundry", "link": "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp", "description": ("Compared to earlier spacecraft, " "the International Space Station has many luxuries, " "but laundry facilities are not one of them. 
" "Instead, astronauts have other options."), "publish_date": "Tue, 20 May 2003 08:56:02 GMT", "guid": "http://liftoff.msfc.nasa.gov/2003/05/20.html#item570", }, ] item = ChannelItem(CSSExtractor("channel>item")) if build_first: item.build() assert item.extract(element) == items_result[0] item = ChannelItem(CSSExtractor("channel>item"), is_many=True) if build_first: item.build() assert item.extract(element) == items_result item = Channel(XPathExtractor("//channel")) if build_first: item.build() assert item.extract(element) == { "title": "Liftoff News", "link": "http://liftoff.msfc.nasa.gov/", "description": "Liftoff to Space Exploration.", "language": "en-us", "publish_date": "Tue, 10 Jun 2003 04:00:00 GMT", "last_build_date": "Tue, 10 Jun 2003 09:41:01 GMT", "docs": "http://blogs.law.harvard.edu/tech/rss", "generator": "Weblog Editor 2.0", "managing_editor": "*****@*****.**", "web_master": "*****@*****.**", "items": items_result, }
class Article(Item):
    # Both fields read the text of a <div> child selected by its class
    # attribute, relative to the node the item is applied to.
    title = Field(
        XPathExtractor("./div[@class='title']/text()"),
    )
    content = Field(
        XPathExtractor("./div[@class='content']/text()"),
    )
if not _missing_jsonpath else pytest.param( "Missing 'jsonpath-extractor'", marks=pytest.mark.skip() ), JSONPathRWExtractor(expr="boo") if not _missing_jsonpath_rw else pytest.param("Missing 'jsonpath-rw'", marks=pytest.mark.skip()), JSONPathRWExtExtractor(expr="boo") if not _missing_jsonpath_rw_ext else pytest.param( "Missing 'jsonpath-rw-ext'", marks=pytest.mark.skip() ), TextCSSExtractor(expr="div.class") if not _missing_cssselect else pytest.param("Missing 'cssselect'", marks=pytest.mark.skip()), XPathExtractor(expr="//div") if not _missing_lxml else pytest.param("Missing 'lxml'", marks=pytest.mark.skip()), ], ids=repr, ) def simple_extractor(request): return request.param def test_complex_extractor_is_extractor(complex_extractor): assert is_extractor(complex_extractor) def test_simple_extractor_is_extractor(simple_extractor): assert is_extractor(simple_extractor)
def test_xpath_result_not_list(element):
    """A scalar-valued XPath is reported as a one-element list by ``extract``.

    ``extract`` must return ``["a"]`` and ``extract_first`` the bare ``"a"``.
    """
    xpath = XPathExtractor("normalize-space(//span)")

    everything = xpath.extract(element)
    assert everything == ["a"]

    first = xpath.extract_first(element)
    assert first == "a"
# One-line Item subclass declaring a single field, `name`, extracted with the
# XPath ./span[@class='name']. The E701 suppression covers the class body
# being written on the same line as the class header.
# NOTE(review): the trailing comment also contains "# fmt: on exc = catch.value",
# which looks like two following source lines fused into it - confirm against
# the enclosing test before touching this line.
class Parameter(Item): name = Field(XPathExtractor("./span[@class='name']")) # type: ignore # noqa: E501, E701 # fmt: on exc = catch.value
def test_field_xpath_extract_result_not_list_conflict_with_is_many(element0):
    """A scalar-valued XPath combined with ``is_many=True`` must warn.

    Both the construction of the field and the extraction run inside the
    ``pytest.warns`` block, so a ``UserWarning`` from either step satisfies
    the check.
    """
    with pytest.warns(UserWarning):
        many_titles = Field(
            XPathExtractor("normalize-space(//div[@class='title'])"),
            is_many=True,
        )
        many_titles.extract(element0)
def test_field_xpath_extract_result_not_list(element0):
    """A ``Field`` over a scalar-valued XPath returns the string itself."""
    title_xpath = XPathExtractor("normalize-space(//div[@class='title'])")
    title_field = Field(title_xpath)

    extracted = title_field.extract(element0)

    assert extracted == "Title 1"
@pytest.fixture(
    params=[
        # Each entry is either a ready-made extractor or, when the matching
        # `_missing_*` flag is set, a skip-marked placeholder parameter.
        (
            pytest.param("Missing 'cssselect'", marks=pytest.mark.skip())
            if _missing_cssselect
            else AttrCSSExtractor(expr="div.class", attr="id")
        ),
        (
            pytest.param("Missing 'cssselect'", marks=pytest.mark.skip())
            if _missing_cssselect
            else CSSExtractor(expr="div.class")
        ),
        (
            pytest.param("Missing 'jsonpath-extractor'", marks=pytest.mark.skip())
            if _missing_jsonpath
            else JSONPathExtractor(expr="boo")
        ),
        (
            pytest.param("Missing 'jsonpath-rw'", marks=pytest.mark.skip())
            if _missing_jsonpath_rw
            else JSONPathRWExtractor(expr="boo")
        ),
        (
            pytest.param("Missing 'jsonpath-rw-ext'", marks=pytest.mark.skip())
            if _missing_jsonpath_rw_ext
            else JSONPathRWExtExtractor(expr="boo")
        ),
        (
            pytest.param("Missing 'cssselect'", marks=pytest.mark.skip())
            if _missing_cssselect
            else TextCSSExtractor(expr="div.class")
        ),
        (
            pytest.param("Missing 'lxml'", marks=pytest.mark.skip())
            if _missing_lxml
            else XPathExtractor(expr="//div")
        ),
    ],
    ids=repr,
)
def simple_extractor(request):
    """Yield one simple extractor per parametrized run."""
    return request.param


def test_complex_extractor_is_extractor(complex_extractor):
    """``is_extractor`` accepts every complex extractor from the fixture."""
    assert is_extractor(complex_extractor)


def test_simple_extractor_is_extractor(simple_extractor):
    """``is_extractor`` accepts every simple extractor from the fixture."""
    assert is_extractor(simple_extractor)
def test_missing_lxml():
    """Building an ``XPathExtractor`` must raise a ``RuntimeError`` naming lxml."""
    with pytest.raises(RuntimeError) as excinfo:
        XPathExtractor("//boo")

    # The message has to mention the package involved.
    message = str(excinfo.value)
    assert "lxml" in message
# One-line Item subclass declaring a single field, `name`, extracted with the
# XPath ./span[@class='name']. The E701 suppression covers the class body
# being written on the same line as the class header.
# NOTE(review): the trailing comment also contains "# fmt: on exc = catch.value",
# which looks like two following source lines fused into it - confirm against
# the enclosing test before touching this line.
class Parameter(Item): name = Field(XPathExtractor("./span[@class='name']")) # noqa: B950, E701 # fmt: on exc = catch.value