Example #1
0
def test_get_online_wikis(request_mock):
    """With the Wikimedia pages mocked, the tracker lists five online wikis including enwiki."""
    set_wikimedia_pages(request_mock)

    tracker = ww.CorporaTracker(verbose=False)
    available = tracker.online_wikis
    assert "enwiki" in available
    assert len(available) == 5
Example #2
0
def test_is_online_problem(request_mock):
    """An HTTPError raised by the mocked dumps URL propagates out of the constructor."""
    dumps_url = "http://dumps.wikimedia.org"
    unexpected_error = requests.exceptions.HTTPError
    request_mock.get(dumps_url, exc=unexpected_error)

    with pytest.raises(unexpected_error):
        ww.CorporaTracker(url=dumps_url, verbose=False)
Example #3
0
def test_is_online_offline(request_mock):
    """A connection timeout on the dumps URL makes the tracker report itself as offline."""
    dumps_url = "http://dumps.wikimedia.org"
    request_mock.get(dumps_url, exc=requests.exceptions.ConnectTimeout)

    tracker = ww.CorporaTracker(verbose=False)
    assert not tracker.is_online()
Example #4
0
def test_local_dir_bad_path(fs):
    """A local directory that does not exist is rejected with FileNotFoundError."""
    for directory in ("/data1", "/data2"):
        fs.create_dir(directory, perm_bits=0o777)

    # "foo" was never created in the fake filesystem.
    requested_dirs = ["/data1", "foo"]
    with pytest.raises(FileNotFoundError):
        ww.CorporaTracker(local_dirs=requested_dirs, online=False, verbose=False)
Example #5
0
def test_local_dir_perm_wrong(fs):
    """A local directory lacking write permission is rejected with PermissionError."""
    # /data2 is deliberately read-only so the permission check has something to reject.
    for directory, mode in (("/data1", 0o777), ("/data2", 0o444)):
        fs.create_dir(directory, perm_bits=mode)

    requested_dirs = ["/data1", "/data2"]
    with pytest.raises(PermissionError):
        ww.CorporaTracker(local_dirs=requested_dirs, online=False, verbose=False)
Example #6
0
def test_local_dir_perm_ok(fs):
    """Directories with full R/W permissions are accepted; the tracker is not online."""
    requested_dirs = ["/data1", "/data2"]
    for directory in requested_dirs:
        fs.create_dir(directory, perm_bits=0o777)

    tracker = ww.CorporaTracker(local_dirs=requested_dirs, online=False, verbose=False)
    assert not tracker.online
Example #7
0
def test_get_online_wikis_mid_dump(request_mock):
    """The online wiki list is still parsed when the page carries the extra dump-day text."""
    mid_dump_page = "./tests/data/wikimedia_wikilist_middump.html"
    set_wikimedia_pages(request_mock, wiki_list_file=mid_dump_page)

    tracker = ww.CorporaTracker(verbose=False)
    available = tracker.online_wikis
    assert "enwiki" in available
    assert len(available) == 5
Example #8
0
def test_get_offline_wikis(fs):
    """Wikis that have files in the local directories are reported by name."""
    for checksum_file in ("/data1/frwiki-20201020-md5sums.txt",
                          "/data2/enwiki-20201020-md5sums.txt"):
        fs.create_file(checksum_file)

    tracker = ww.CorporaTracker(local_dirs=["/data1", "/data2"], online=False, verbose=False)
    assert len(tracker.list_local_wikis()) == 2
    assert "enwiki" in tracker.list_local_wikis()
Example #9
0
def test_wiki_file_identification(fs):
    """File names from the enwiki listing are classified; only a few remain unknown."""
    listing_path = "./tests/data/local_filelist_enwiki.txt"
    # Expose the real listing inside the fake filesystem, then create one
    # empty fake file under /data for every name it contains.
    fs.add_real_file(listing_path)
    with open(listing_path, "rt") as listing:
        for entry in listing:
            fs.create_file(f"/data/{entry.rstrip()}")

    tracker = ww.CorporaTracker(local_dirs=["/data"], online=False, verbose=False)

    # Expected counts are tied to the contents of the listing file.
    assert tracker.get_local_file_count() == 1836
    assert len(tracker.list_local_wikis()) == 1
    assert len(tracker.list_unknown_files()) == 3
    assert len(tracker.list_unknown_files("enwiki")) == 2
    assert len(tracker.list_unknown_files("enwiki", "20201001")) == 1
    assert len(tracker.list_local_checksum_files("enwiki", "20201001")) == 2
Example #10
0
def test_file_counts(fs):
    """Per-wiki and per-dump file counts are correct after scanning the local directories."""
    fake_files = (
        "/data1/enwiki-20201001-md5sums.txt",
        "/data1/enwiki-20201020-md5sums.txt",
        "/data2/frwiki-20201020-md5sums.txt",
        "/data2/enwiki-20201020-md5sums.txt",
        "/data2/frwiki-20201020-foo.txt",
    )
    for fake_file in fake_files:
        fs.create_file(fake_file)

    tracker = ww.CorporaTracker(local_dirs=["/data1", "/data2"], online=False, verbose=False)
    assert tracker.get_local_file_count("enwiki") == 3
    assert tracker.get_local_file_count("enwiki", "20201020") == 2
Example #11
0
def test_local_dir_scan(fs):
    """Wikis, dumps and file counts are aggregated across all local directories."""
    fake_files = (
        "/data1/enwiki-20201001-md5sums.txt",
        "/data1/enwiki-20201020-md5sums.txt",
        "/data2/frwiki-20201020-md5sums.txt",
        "/data2/enwiki-20201020-md5sums.txt",
    )
    for fake_file in fake_files:
        fs.create_file(fake_file)

    tracker = ww.CorporaTracker(local_dirs=["/data1", "/data2"], online=False, verbose=False)

    assert len(tracker.list_local_wikis()) == 2
    assert len(tracker.list_local_dumps()) == 3
    assert len(tracker.list_local_dumps("enwiki")) == 2
    assert tracker.get_local_file_count("enwiki") == 3
    assert tracker.get_local_file_count("enwiki", "20201020") == 2
Example #12
0
def test_is_online_online(request_mock):
    """With the Wikimedia pages mocked, the tracker reports itself as online."""
    set_wikimedia_pages(request_mock)

    tracker = ww.CorporaTracker(verbose=False)
    assert tracker.is_online()