Python XPathExtractorの例、data_extractor.lxml.XPathExtractor Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_lxml.py プロジェクト: NyntoFive/data_extractor

def test_xpath_result_not_list(element):
    """Check an XPath expression whose result is a scalar, not a node list.

    ``normalize-space(//span)`` is an XPath string function, so ``extract``
    is expected to hand back the plain string, while ``extract_first`` is
    expected to emit a ``UserWarning``.
    """
    extractor = XPathExtractor("normalize-space(//span)")

    assert extractor.extract(element) == "a"

    with pytest.warns(UserWarning):
        # This comparison used to be a bare expression whose result was
        # silently discarded; assert it so the returned value is verified.
        assert extractor.extract_first(element) == "a"

コード例 #2

0

ファイルを表示

ファイル: test_lxml.py プロジェクト: NyntoFive/data_extractor

def test_invalid_xpath_expr(element, expr):
    """Extracting with an invalid XPath expression must raise ``ExprError``.

    The raised error must reference the failing extractor and carry an
    ``XPathEvalError`` in its ``exc`` attribute.
    """
    xpath_extractor = XPathExtractor(expr)

    with pytest.raises(ExprError) as exc_info:
        xpath_extractor.extract(element)

    raised = exc_info.value
    # The error must point back at the very extractor instance that failed.
    assert raised.extractor is xpath_extractor
    wrapped = raised.exc
    assert isinstance(wrapped, XPathEvalError)

コード例 #3

0

ファイルを表示

ファイル: test_lxml.py プロジェクト: walison17/data_extractor

def test_invalid_xpath_expr_by_extract(element, expr):
    """An invalid XPath expression surfaces as ``ExprError`` on ``extract``.

    The extractor must report ``built`` as falsy both before and after the
    failed extraction, and the error must wrap an ``XPathError``.
    """
    xpath_extractor = XPathExtractor(expr)
    assert not xpath_extractor.built

    with pytest.raises(ExprError) as exc_info:
        xpath_extractor.extract(element)

    # A failed extraction must not leave the extractor marked as built.
    assert not xpath_extractor.built

    raised = exc_info.value
    assert raised.extractor is xpath_extractor
    wrapped = raised.exc
    assert isinstance(wrapped, XPathError)

    message = str(raised)
    assert re.match(r"ExprError with .+? raised by .+? extracting", message)

コード例 #4

0

ファイルを表示

ファイル: test_lxml.py プロジェクト: linw1995/data_extractor

def test_invalid_xpath_expr_by_XPathEvalError_from_extract(element, expr):
    """``extract`` with an invalid expression raises a wrapping ``ExprError``.

    The wrapped exception must be lxml's ``XPathEvalError`` and the string
    form of the error must follow the "ExprError with ... raised by ...
    extracting" pattern.
    """
    xpath_extractor = XPathExtractor(expr)

    with pytest.raises(ExprError) as exc_info:
        xpath_extractor.extract(element)

    raised = exc_info.value
    assert raised.extractor is xpath_extractor

    # Third Party Library
    from lxml.etree import XPathEvalError

    wrapped = raised.exc
    assert isinstance(wrapped, XPathEvalError)

    message = str(raised)
    assert re.match(r"ExprError with .+? raised by .+? extracting", message)

コード例 #5

0

ファイルを表示

def test_field_overwrites_item_parameter_type_creation(stack_frame_support,
                                                       item_property):
    """Expect ``SyntaxError`` when ``type()`` builds an ``Item`` subclass
    whose namespace stores a ``Field`` under ``item_property``.

    With ``stack_frame_support`` the error must carry this file's name, the
    line number of the ``type(...)`` call, offset 8 and the call's source
    text; without it, filename/lineno/offset are ``None`` and the text is a
    synthesized ``<property>=Field(...)`` snippet.
    """
    # NOTE: the ``type(...)`` call below must stay on ONE physical line and no
    # line may be inserted between it and the ``exc.lineno`` assertion: the
    # test compares ``exc.text`` with that exact source line and computes the
    # expected line number from a fixed offset (``f_lineno - 6``).
    with pytest.raises(SyntaxError) as catch:
        # fmt: off
        type("Parameter", (Item,), {item_property: Field(XPathExtractor("./span[@class='name']"))})  # noqa: E950
        # fmt: on

    exc = catch.value
    if stack_frame_support:
        assert exc.filename == __file__
        assert exc.lineno == inspect.currentframe().f_lineno - 6
        assert exc.offset == 8
        assert (exc.text == """
        type("Parameter", (Item,), {item_property: Field(XPathExtractor("./span[@class='name']"))})  # noqa: E950
        """.strip())
    else:
        assert exc.filename is None
        assert exc.lineno is None
        assert exc.offset is None
        assert (
            exc.text ==
            f"""{item_property}=Field(XPathExtractor("./span[@class='name']"))"""
        )

コード例 #6

0

ファイルを表示

ファイル: test_lxml.py プロジェクト: linw1995/data_extractor

def test_invalid_xpath_expr(expr):
    """Constructing an extractor from an invalid XPath raises ``ExprError``.

    The wrapped exception must be lxml's ``XPathError`` and the string form
    of the error must follow the "ExprError with ... raised by ...
    extracting" pattern.
    """
    with pytest.raises(ExprError) as exc_info:
        XPathExtractor(expr)

    raised = exc_info.value

    # Third Party Library
    from lxml.etree import XPathError

    wrapped = raised.exc
    assert isinstance(wrapped, XPathError)

    message = str(raised)
    assert re.match(r"ExprError with .+? raised by .+? extracting", message)

コード例 #7

0

ファイルを表示

def test_field_xpath_extract_result_not_list(element0, build_first):
    """A ``Field`` over a scalar XPath result extracts the plain string.

    Also tracks the ``built`` flag of the field and of its extractor: falsy
    after construction, truthy after an explicit ``build()`` (when
    ``build_first`` is set) and truthy after extraction in either case.
    """
    title_field = Field(
        XPathExtractor("normalize-space(//div[@class='title'])")
    )

    # Nothing is built right after construction.
    assert not title_field.built
    assert not title_field.extractor.built

    if build_first:
        title_field.build()
        assert title_field.built
        assert title_field.extractor.built

    extracted = title_field.extract(element0)
    assert extracted == "Title 1"

    # Extraction leaves both the field and its extractor built.
    assert title_field.built
    assert title_field.extractor.built

コード例 #8

0

ファイルを表示

ファイル: test_item.py プロジェクト: NyntoFive/data_extractor

def test_field_name_overwrite_item_parameter_type_creation():
    """Expect ``SyntaxError`` when ``type()`` builds an ``Item`` subclass
    whose namespace stores a ``Field`` under the key ``"name"``.

    The error must carry this file's name, the line number of the
    ``type(...)`` call, offset 8 and the call's exact source text.
    """
    # NOTE: the ``type(...)`` call below must stay on ONE physical line and no
    # line may be inserted between it and the ``exc.lineno`` assertion: the
    # test compares ``exc.text`` with that exact source line and computes the
    # expected line number from a fixed offset (``f_lineno - 5``).
    with pytest.raises(SyntaxError) as catch:
        # fmt: off
        type("Parameter", (Item,), {"name": Field(XPathExtractor("./span[@class='name']"))})
        # fmt: on

    exc = catch.value
    assert exc.filename == __file__
    assert exc.lineno == inspect.currentframe().f_lineno - 5
    assert exc.offset == 8
    assert (
        exc.text ==
        'type("Parameter", (Item,), {"name": Field(XPathExtractor("./span[@class=\'name\']"))})'
    )

コード例 #9

0

ファイルを表示

 # Item subclass declaring a single field, ``name``, whose value is taken
 # from the relative XPath ``./span[@class='name']``.
 # NOTE(review): the E701 suppression suggests this was originally written as
 # a one-line ``class ...: name = ...`` statement — confirm whether the exact
 # layout matters to the surrounding test before reformatting.
 class Parameter(Item):
     name = Field(
         XPathExtractor("./span[@class='name']"))  # noqa: B950, E701

コード例 #10

0

ファイルを表示

    # Item describing a channel element: each Field below reads the text of
    # one child element via an XPath relative to the channel node.
    class Channel(Item):
        title = Field(XPathExtractor("./title/text()"))
        link = Field(XPathExtractor("./link/text()"))
        description = Field(XPathExtractor("./description/text()"))
        language = Field(XPathExtractor("./language/text()"))
        publish_date = Field(XPathExtractor("./pubDate/text()"))
        last_build_date = Field(XPathExtractor("./lastBuildDate/text()"))
        docs = Field(XPathExtractor("./docs/text()"))
        generator = Field(XPathExtractor("./generator/text()"))
        managing_editor = Field(XPathExtractor("./managingEditor/text()"))
        web_master = Field(XPathExtractor("./webMaster/text()"))

        # Nested item: selects every ``./item`` child (``is_many=True``) and
        # extracts each one with ``ChannelItem``, which is defined elsewhere.
        items = ChannelItem(XPathExtractor("./item"), is_many=True)

コード例 #11

0

ファイルを表示

 # Item describing one channel entry: each Field reads the text of a child
 # element via an XPath relative to the entry node.
 class ChannelItem(Item):
     # ``title`` and ``link`` are declared with ``default=""``; the other
     # fields are declared without a default.
     title = Field(XPathExtractor("./title/text()"), default="")
     link = Field(XPathExtractor("./link/text()"), default="")
     description = Field(XPathExtractor("./description/text()"))
     publish_date = Field(XPathExtractor("./pubDate/text()"))
     guid = Field(XPathExtractor("./guid/text()"))

コード例 #12

0

ファイルを表示

def test_complex_item_extract_xml_data(build_first):
    """Extract nested ``Item`` structures from the bundled RSS sample file.

    Three extractions are checked against ``assets/sample-rss-2.xml``:
    a single ``ChannelItem`` selected with a CSS expression, every
    ``ChannelItem`` with ``is_many=True``, and the whole ``Channel``
    (including its nested items) selected with an XPath expression.
    When ``build_first`` is truthy each item is built explicitly before
    extraction.
    """
    from lxml.etree import fromstring

    sample_rss_path = Path(__file__).parent / "assets" / "sample-rss-2.xml"
    text = sample_rss_path.read_text()
    element = fromstring(text)

    class ChannelItem(Item):
        title = Field(XPathExtractor("./title/text()"), default="")
        link = Field(XPathExtractor("./link/text()"), default="")
        description = Field(XPathExtractor("./description/text()"))
        publish_date = Field(XPathExtractor("./pubDate/text()"))
        guid = Field(XPathExtractor("./guid/text()"))

    class Channel(Item):
        title = Field(XPathExtractor("./title/text()"))
        link = Field(XPathExtractor("./link/text()"))
        description = Field(XPathExtractor("./description/text()"))
        language = Field(XPathExtractor("./language/text()"))
        publish_date = Field(XPathExtractor("./pubDate/text()"))
        last_build_date = Field(XPathExtractor("./lastBuildDate/text()"))
        docs = Field(XPathExtractor("./docs/text()"))
        generator = Field(XPathExtractor("./generator/text()"))
        managing_editor = Field(XPathExtractor("./managingEditor/text()"))
        web_master = Field(XPathExtractor("./webMaster/text()"))

        items = ChannelItem(XPathExtractor("./item"), is_many=True)

    # Expected extraction result for every <item> of the sample feed, in
    # document order. The second entry expects "" for title and link, the
    # defaults declared on ChannelItem.
    items_result = [
        {
            "title": "Star City",
            "link": "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp",
            "description": (
                "How do Americans get ready to work "
                "with Russians aboard the International Space Station? "
                "They take a crash course in culture, "
                "language and protocol at Russia's "
                '<a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.'
            ),
            "publish_date": "Tue, 03 Jun 2003 09:39:21 GMT",
            "guid": "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573",
        },
        {
            "title": "",
            "link": "",
            "description": (
                "Sky watchers in Europe, Asia, and parts of Alaska and Canada "
                "will experience a "
                '<a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">'  # noqa: B950
                "partial eclipse of the Sun"
                "</a> on Saturday, May 31st."
            ),
            "publish_date": "Fri, 30 May 2003 11:06:42 GMT",
            "guid": "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572",
        },
        {
            "title": "The Engine That Does More",
            "link": "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp",
            "description": (
                "Before man travels to Mars, "
                "NASA hopes to design new engines "
                "that will let us fly through the Solar System more quickly.  "
                "The proposed VASIMR engine would do that."
            ),
            "publish_date": "Tue, 27 May 2003 08:37:32 GMT",
            "guid": "http://liftoff.msfc.nasa.gov/2003/05/27.html#item571",
        },
        {
            "title": "Astronauts' Dirty Laundry",
            "link": "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp",
            "description": (
                "Compared to earlier spacecraft, "
                "the International Space Station has many luxuries, "
                "but laundry facilities are not one of them.  "
                "Instead, astronauts have other options."
            ),
            "publish_date": "Tue, 20 May 2003 08:56:02 GMT",
            "guid": "http://liftoff.msfc.nasa.gov/2003/05/20.html#item570",
        },
    ]

    # Case 1: without is_many only the first matching <item> is compared.
    item = ChannelItem(CSSExtractor("channel>item"))
    if build_first:
        item.build()
    assert item.extract(element) == items_result[0]

    # Case 2: with is_many=True every <item> is compared.
    item = ChannelItem(CSSExtractor("channel>item"), is_many=True)
    if build_first:
        item.build()
    assert item.extract(element) == items_result

    # Case 3: the whole channel, including the nested item list.
    item = Channel(XPathExtractor("//channel"))
    if build_first:
        item.build()
    assert item.extract(element) == {
        "title": "Liftoff News",
        "link": "http://liftoff.msfc.nasa.gov/",
        "description": "Liftoff to Space Exploration.",
        "language": "en-us",
        "publish_date": "Tue, 10 Jun 2003 04:00:00 GMT",
        "last_build_date": "Tue, 10 Jun 2003 09:41:01 GMT",
        "docs": "http://blogs.law.harvard.edu/tech/rss",
        "generator": "Weblog Editor 2.0",
        # These two values had been replaced by an e-mail redaction mask
        # ("*****@*****.**"), which cannot match the feed content. Restored
        # to the addresses used by the standard RSS 2.0 sample feed.
        # NOTE(review): confirm against assets/sample-rss-2.xml.
        "managing_editor": "editor@example.com",
        "web_master": "webmaster@example.com",
        "items": items_result,
    }

コード例 #13

0

ファイルを表示

 # Item with two text fields, each read from a child <div> selected by its
 # ``class`` attribute, relative to the node the item is applied to.
 class Article(Item):
     title = Field(XPathExtractor("./div[@class='title']/text()"))
     content = Field(XPathExtractor("./div[@class='content']/text()"))

コード例 #14

0

ファイルを表示

ファイル: test_utils.py プロジェクト: x0rzkov/data_extractor

        if not _missing_jsonpath
        else pytest.param(
            "Missing 'jsonpath-extractor'", marks=pytest.mark.skip()
        ),
        JSONPathRWExtractor(expr="boo")
        if not _missing_jsonpath_rw
        else pytest.param("Missing 'jsonpath-rw'", marks=pytest.mark.skip()),
        JSONPathRWExtExtractor(expr="boo")
        if not _missing_jsonpath_rw_ext
        else pytest.param(
            "Missing 'jsonpath-rw-ext'", marks=pytest.mark.skip()
        ),
        TextCSSExtractor(expr="div.class")
        if not _missing_cssselect
        else pytest.param("Missing 'cssselect'", marks=pytest.mark.skip()),
        XPathExtractor(expr="//div")
        if not _missing_lxml
        else pytest.param("Missing 'lxml'", marks=pytest.mark.skip()),
    ],
    ids=repr,
)
def simple_extractor(request):
    return request.param


def test_complex_extractor_is_extractor(complex_extractor):
    """``is_extractor`` must accept the ``complex_extractor`` fixture value."""
    recognised = is_extractor(complex_extractor)
    assert recognised


def test_simple_extractor_is_extractor(simple_extractor):
    """``is_extractor`` must accept the ``simple_extractor`` fixture value."""
    recognised = is_extractor(simple_extractor)
    assert recognised

コード例 #15

0

ファイルを表示

ファイル: test_lxml.py プロジェクト: linw1995/data_extractor

def test_xpath_result_not_list(element):
    """Check extraction with an XPath string function as the expression.

    ``extract`` is expected to return the one-element list ``["a"]`` and
    ``extract_first`` the bare string ``"a"``.
    """
    xpath_extractor = XPathExtractor("normalize-space(//span)")

    all_results = xpath_extractor.extract(element)
    assert all_results == ["a"]

    first_result = xpath_extractor.extract_first(element)
    assert first_result == "a"

コード例 #16

0

ファイルを表示

ファイル: test_item.py プロジェクト: linw1995/data_extractor

        class Parameter(Item): name = Field(XPathExtractor("./span[@class='name']"))  # type: ignore # noqa: E501, E701
        # fmt: on

    exc = catch.value

コード例 #17

0

ファイルを表示

ファイル: test_item.py プロジェクト: NyntoFive/data_extractor

def test_field_xpath_extract_result_not_list_conflict_with_is_many(element0):
    """Combining a scalar XPath result with ``is_many=True`` must warn.

    Both the construction of the field and the extraction run inside the
    ``pytest.warns`` block, so a ``UserWarning`` from either step satisfies
    the expectation.
    """
    with pytest.warns(UserWarning):
        title_extractor = XPathExtractor(
            "normalize-space(//div[@class='title'])"
        )
        many_field = Field(title_extractor, is_many=True)
        many_field.extract(element0)

コード例 #18

0

ファイルを表示

ファイル: test_item.py プロジェクト: NyntoFive/data_extractor

def test_field_xpath_extract_result_not_list(element0):
    """A ``Field`` over a scalar XPath result extracts the plain string."""
    title_extractor = XPathExtractor("normalize-space(//div[@class='title'])")
    title_field = Field(title_extractor)

    extracted = title_field.extract(element0)

    assert extracted == "Title 1"

コード例 #19

0

ファイルを表示

ファイル: test_utils.py プロジェクト: linw1995/data_extractor

@pytest.fixture(
    params=[
        # Each entry is either a ready extractor instance or, when the
        # matching ``_missing_*`` flag is truthy, a skipped placeholder.
        (
            pytest.param("Missing 'cssselect'", marks=pytest.mark.skip())
            if _missing_cssselect
            else AttrCSSExtractor(expr="div.class", attr="id")
        ),
        (
            pytest.param("Missing 'cssselect'", marks=pytest.mark.skip())
            if _missing_cssselect
            else CSSExtractor(expr="div.class")
        ),
        (
            pytest.param(
                "Missing 'jsonpath-extractor'", marks=pytest.mark.skip()
            )
            if _missing_jsonpath
            else JSONPathExtractor(expr="boo")
        ),
        (
            pytest.param("Missing 'jsonpath-rw'", marks=pytest.mark.skip())
            if _missing_jsonpath_rw
            else JSONPathRWExtractor(expr="boo")
        ),
        (
            pytest.param("Missing 'jsonpath-rw-ext'", marks=pytest.mark.skip())
            if _missing_jsonpath_rw_ext
            else JSONPathRWExtExtractor(expr="boo")
        ),
        (
            pytest.param("Missing 'cssselect'", marks=pytest.mark.skip())
            if _missing_cssselect
            else TextCSSExtractor(expr="div.class")
        ),
        (
            pytest.param("Missing 'lxml'", marks=pytest.mark.skip())
            if _missing_lxml
            else XPathExtractor(expr="//div")
        ),
    ],
    ids=repr,
)
def simple_extractor(request):
    """Return the current fixture parameter (one entry of ``params``)."""
    current_param = request.param
    return current_param


def test_complex_extractor_is_extractor(complex_extractor):
    """``is_extractor`` must accept the ``complex_extractor`` fixture value."""
    recognised = is_extractor(complex_extractor)
    assert recognised


def test_simple_extractor_is_extractor(simple_extractor):
    """``is_extractor`` must accept the ``simple_extractor`` fixture value."""
    recognised = is_extractor(simple_extractor)
    assert recognised

コード例 #20

0

ファイルを表示

ファイル: test_utils.py プロジェクト: linw1995/data_extractor

def test_missing_lxml():
    """Constructing ``XPathExtractor`` must raise a ``RuntimeError`` whose
    message mentions ``lxml``.
    """
    with pytest.raises(RuntimeError) as exc_info:
        XPathExtractor("//boo")

    message = str(exc_info.value)
    assert "lxml" in message

コード例 #21

0

ファイルを表示

ファイル: test_item.py プロジェクト: x0rzkov/data_extractor

        class Parameter(Item): name = Field(XPathExtractor("./span[@class='name']"))  # noqa: B950, E701
        # fmt: on

    exc = catch.value