def test_no_wellformed(testpath):
    """
    Test scraper without well-formed check.

    A small UTF-8 encoded XML document is written under ``testpath`` and
    scraped with the second positional argument of LxmlScraper set to
    False. The scraper must then report a "Skipping scraper" message and
    leave ``well_formed`` as None.

    :testpath: Directory in which the test file is created
    """
    xml = """<?xml version="1.0" encoding="UTF-8" ?>
              <a>åäö</a>""".encode("utf-8")
    # The file lives under testpath only; creating an extra file with
    # tempfile.mkstemp() would leave a stray file and an open descriptor.
    tmppath = os.path.join(testpath, "valid__.csv")
    with open(tmppath, "wb") as file_:
        file_.write(xml)
    scraper = LxmlScraper(tmppath, False)
    scraper.scrape_file()
    assert partial_message_included("Skipping scraper", scraper.messages())
    assert scraper.well_formed is None
def test_forced_filetype(filename, result_dict, filetype, evaluate_scraper):
    """
    Check scraping when the MIME type and version are given by the user.

    :filename: Passed on to force_correct_filetype
    :result_dict: Passed on to force_correct_filetype
    :filetype: Mapping holding the "given_mimetype" and "given_version"
               entries that are forced on the scraper
    :evaluate_scraper: Callable invoked with the scraper and the expected
                       result once the file has been scraped
    """
    expected = force_correct_filetype(
        filename, result_dict, filetype, ["(:unav)"])

    forced = dict(mimetype=filetype["given_mimetype"],
                  version=filetype["given_version"])

    scraper = LxmlScraper(expected.filename, True, forced)
    scraper.scrape_file()
    evaluate_scraper(scraper, expected)
def test_xml_encoding(testpath, file_encoding):
    """
    Check that a file whose XML header declares its encoding is well-formed.

    :testpath: Directory in which the test file is created
    :file_encoding: Codec name used to encode the file; must be one of the
                    keys of the mapping below
    """
    # Codec name -> encoding name written into the XML declaration.
    declared_names = {
        "latin_1": u"ISO-8859-15",
        "utf_8": "UTF-8",
        "utf_16": "UTF-16"
    }
    template = """<?xml version="1.0" encoding="{}" ?>
              <a>åäö</a>"""
    document = template.format(declared_names[file_encoding])
    target = os.path.join(testpath, "valid__.csv")
    with open(target, "wb") as handle:
        handle.write(document.encode(file_encoding))

    scraper = LxmlScraper(target, "text/xml")
    scraper.scrape_file()
    # NOTE(review): a comparison of scraper.streams[0]["charset"] with the
    # declared encoding name is currently disabled in this test.
    assert scraper.well_formed
Esempio n. 4
0
def test_xml_encoding(testpath, file_encoding):
    """
    Check that a file whose XML header declares its encoding is well-formed.

    The declared encoding name is also handed to the scraper as the
    "charset" parameter.

    :testpath: Directory in which the test file is created
    :file_encoding: Codec name used to encode the file; must be one of the
                    keys of the mapping below
    """
    # Codec name -> encoding name written into the XML declaration.
    declared_names = {
        "latin_1": u"ISO-8859-15",
        "utf_8": "UTF-8",
        "utf_16": "UTF-16"
    }
    template = """<?xml version="1.0" encoding="{}" ?>
              <a>åäö</a>"""
    document = template.format(declared_names[file_encoding])
    target = os.path.join(testpath, "valid__.csv")
    with io.open(target, "wb") as handle:
        handle.write(document.encode(file_encoding))

    scraper_params = {"charset": declared_names[file_encoding]}
    scraper = LxmlScraper(
        filename=target,
        mimetype="text/xml",
        params=scraper_params)
    scraper.scrape_file()
    assert scraper.well_formed
Esempio n. 5
0
def test_is_supported_deny():
    """
    Test which argument combinations is_supported accepts for HTML 5.0.

    text/html is accepted with version "5.0" or None when the third
    argument is True. It is rejected when a params dict with a
    "schematron" key is given, when the third argument is False, and for
    an unknown version or MIME type.
    """
    html_mime, html_version = "text/html", "5.0"
    supported = LxmlScraper.is_supported

    # Accepted combinations
    assert supported(html_mime, html_version, True)
    assert supported(html_mime, None, True)

    # Rejected combinations
    assert not supported(html_mime, html_version, True,
                         {"schematron": "test"})
    assert not supported(html_mime, html_version, False)
    assert not supported(html_mime, "foo", True)
    assert not supported("foo", html_version, True)
Esempio n. 6
0
def test_charset(filename, mimetype, charset, well_formed):
    """
    Test how the charset parameter affects the scraping result.

    :filename: Test file name
    :mimetype: File MIME type
    :charset: File character encoding handed to the scraper
    :well_formed: Expected result of well-formedness
    """
    scraper = LxmlScraper(filename=filename,
                          mimetype=mimetype,
                          params={"charset": charset})
    scraper.scrape_file()
    assert scraper.well_formed == well_formed

    # Without a charset the scraper must complain about the missing encoding.
    if not charset:
        assert partial_message_included("encoding not defined",
                                        scraper.errors())
        return

    # A well-formed result must not carry any errors.
    if well_formed:
        assert not scraper.errors()
        return

    # Otherwise the error must name the encoding declared in the file.
    assert partial_message_included("Found encoding declaration UTF-8",
                                    scraper.errors())