def test_parse_url_invalid_req_cols(input_df):
    """Check that ``dns.parse_url`` rejects an unsupported ``req_cols`` entry.

    The call is made with ``req_cols={"test"}`` and must raise a
    ``ValueError`` whose message matches the expected text built below.
    """
    expected_error = ValueError(
        "Given req_cols must be subset of %s"
        % (["hostname", "subdomain", "domain", "suffix"])
    )
    with pytest.raises(ValueError) as actual_error:
        dns.parse_url(input_df["url"], req_cols={"test"})
    # The check must run after the ``with`` block: a statement placed after
    # the raising call inside the block never executes. ``actual_error`` is
    # pytest's ExceptionInfo wrapper, so compare the message of the wrapped
    # exception rather than the wrapper against an exception instance (two
    # distinct exception objects never compare equal).
    # NOTE(review): assumes the implementation formats the allowed columns
    # exactly like the list above - confirm against dns.parse_url.
    assert str(actual_error.value) == str(expected_error)
def test_parse_url(input_df):
    """Check the ``domain`` and ``suffix`` columns returned by ``dns.parse_url``.

    Only those two columns are requested through ``req_cols``; each one is
    compared against the expected values column by column.
    """
    domains = [
        "google",
        "gmail",
        "github",
        "pydata",
        "worldbank",
        "waiterrant",
        "cnn",
        "cnn",
        "cnn",
        "news",
        "news",
        "news",
        "sbcglobal",
        "akamaitechnologies",
    ]
    suffixes = [
        "com",
        "com",
        "com",
        "org",
        "org.kg",
        "blogspot.com",
        "com.ac",
        "ac",
        "com",
        "uk",
        "co.uk",
        "co.uk",
        "net",
        "com",
    ]
    expected = DataFrame({"domain": domains, "suffix": suffixes})

    parsed = dns.parse_url(input_df["url"], req_cols={"domain", "suffix"})

    # Compare per column so only the expected columns are inspected.
    for column in expected.columns:
        assert expected[column].equals(parsed[column])
def test2_parse_url(input_df):
    """Check the full frame returned by ``dns.parse_url`` without ``req_cols``.

    The result is compared as a whole against a frame holding the expected
    ``hostname``, ``subdomain``, ``domain`` and ``suffix`` columns.
    """
    hostnames = [
        "www.google.com",
        "gmail.com",
        "github.com",
        "pandas.pydata.org",
        "www.worldbank.org.kg",
        "waiterrant.blogspot.com",
        "forums.news.cnn.com.ac",
        "forums.news.cnn.ac",
        "b.cnn.com",
        "a.news.uk",
        "a.news.co.uk",
        "a.news.co.uk",
        "107-193-100-2.lightspeed.cicril.sbcglobal.net",
        "a23-44-13-2.deploy.static.akamaitechnologies.com",
    ]
    subdomains = [
        "www",
        "",
        "",
        "pandas",
        "www",
        "",
        "forums.news",
        "forums.news",
        "b",
        "a",
        "a",
        "a",
        "107-193-100-2.lightspeed.cicril",
        "a23-44-13-2.deploy.static",
    ]
    domains = [
        "google",
        "gmail",
        "github",
        "pydata",
        "worldbank",
        "waiterrant",
        "cnn",
        "cnn",
        "cnn",
        "news",
        "news",
        "news",
        "sbcglobal",
        "akamaitechnologies",
    ]
    suffixes = [
        "com",
        "com",
        "com",
        "org",
        "org.kg",
        "blogspot.com",
        "com.ac",
        "ac",
        "com",
        "uk",
        "co.uk",
        "co.uk",
        "net",
        "com",
    ]
    expected = DataFrame(
        {
            "hostname": hostnames,
            "subdomain": subdomains,
            "domain": domains,
            "suffix": suffixes,
        }
    )

    parsed = dns.parse_url(input_df["url"])

    assert expected.equals(parsed)