Beispiel #1
0
 def test_resolve_pattern_two_args(self):
     # A two-argument placeholder "{2, 4}" is expected to produce pages
     # 2 and 3 only: the second argument is not part of the output.
     pattern = "https://example.com/search?page={2, 4}"
     resolved = list(resolve_url_pattern(pattern))
     wanted = [
         "https://example.com/search?page=2",
         "https://example.com/search?page=3",
     ]
     # Order-insensitive comparison: only the set of URLs matters here.
     self.assertCountEqual(wanted, resolved)
Beispiel #2
0
 def test_resolve_pattern_three_args(self):
     # A three-argument placeholder "{20, 80, 20}" is expected to produce
     # offsets 20, 40 and 60; the value 80 itself must not appear.
     pattern = "https://example.com/search?offset={20, 80, 20}"
     resolved = list(resolve_url_pattern(pattern))
     wanted = [
         "https://example.com/search?offset=20",
         "https://example.com/search?offset=40",
         "https://example.com/search?offset=60",
     ]
     # Order-insensitive comparison: only the set of URLs matters here.
     self.assertCountEqual(wanted, resolved)
Beispiel #3
0
 def test_resolve_pattern_one_arg(self):
     # A single-argument placeholder "{4}" is expected to produce the four
     # items 0, 1, 2 and 3.
     pattern = "https://example.com/items?item={4}"
     resolved = list(resolve_url_pattern(pattern))
     wanted = [
         "https://example.com/items?item=0",
         "https://example.com/items?item=1",
         "https://example.com/items?item=2",
         "https://example.com/items?item=3",
     ]
     # Order-insensitive comparison: only the set of URLs matters here.
     self.assertCountEqual(wanted, resolved)
Beispiel #4
0
 def test_resolve_empty(self):
     # An empty pattern string must not resolve to any URL.
     resolved = list(resolve_url_pattern(""))
     self.assertCountEqual([], resolved)
Beispiel #5
0
 def test_resolve_multiple_patterns(self):
     # A URL carrying two placeholders ("{3}" and "{7}") is expected to
     # resolve to nothing at all.
     pattern = "https://example.com/?a={3}&b={7}"
     resolved = list(resolve_url_pattern(pattern))
     self.assertCountEqual([], resolved)
Beispiel #6
0
 def test_resolve_pattern_no_args(self):
     # An empty placeholder "{}" carries no arguments and is expected to
     # resolve to nothing.
     pattern = "https://example.com/{}"
     resolved = list(resolve_url_pattern(pattern))
     self.assertCountEqual([], resolved)
Beispiel #7
0
 def test_resolve_no_pattern(self):
     # A URL without any placeholder is expected to come back as the single
     # resolved result, unchanged.
     resolved = list(resolve_url_pattern("http://example.com/hello"))
     self.assertCountEqual(["http://example.com/hello"], resolved)
Beispiel #8
0
 def test_resolve_no_http(self):
     # The bare host "example.com" (no "http"/"https" prefix) is expected
     # to resolve to nothing.
     resolved = list(resolve_url_pattern("example.com"))
     self.assertCountEqual([], resolved)
Beispiel #9
0
def parse_pages(url_pattern: str,
                selectors: List[str],
                *,
                timeout: float = 30.0) -> Iterator[List[List[str]]]:
    """Fetch every URL resolved from *url_pattern* and yield its parsed data.

    The URLs come from ``resolve_url_pattern(url_pattern)``. Each one is
    downloaded with ``requests.get`` and the raw response body is handed to
    ``parse_html`` together with *selectors*. This is a generator, so a page
    is only fetched when the consumer asks for the next item.

    Args:
        url_pattern: Pattern passed unchanged to ``resolve_url_pattern``.
        selectors: Passed unchanged to ``parse_html`` for every page.
        timeout: Seconds ``requests`` waits for the server before giving up
            on a request. Keyword-only; defaults to 30 seconds.

    Yields:
        For each fetched URL, a list of everything ``parse_html`` produced
        for that page.

    Raises:
        requests.exceptions.Timeout: If a server does not answer within
            *timeout* seconds.
    """
    for url in resolve_url_pattern(url_pattern):
        # requests applies no timeout unless told to; without one a single
        # unresponsive server would block this generator forever.
        response = requests.get(url, timeout=timeout)
        # NOTE(review): the HTTP status is not checked, so error pages are
        # parsed like any other response -- confirm this is intended.
        yield list(parse_html(response.content, selectors))