def download_resources(self, query, directory, filename_model=None,
                       ids=None, index=1, ids_digit_len=None,
                       index_digit_len=0, duplicate_check=False):
    """Download every resource matched by ``query`` in this page.

    The page is opened with ``self._open()``. If ``self.request`` is falsy
    afterwards, nothing is downloaded and ``None`` is returned. Otherwise
    ``search_in_html(self.html, query, self.url)`` is iterated and each
    resulting URL is wrapped in a ``ResourceGrabber`` whose ``download``
    method is called.

    Args:
        query: Selector passed to ``search_in_html``.
        directory: Destination passed to ``ResourceGrabber.download``.
        filename_model, ids, index, ids_digit_len, index_digit_len,
        duplicate_check: Forwarded unchanged, by keyword, to
            ``ResourceGrabber.download``; their meaning is defined there.
            ``ids`` and ``ids_digit_len`` default to a new empty list.

    Returns:
        None.
    """
    # Fresh lists per call: a literal ``[]`` default would be one shared
    # object reused (and possibly mutated) across every invocation.
    if ids is None:
        ids = []
    if ids_digit_len is None:
        ids_digit_len = []

    self._open()
    if not self.request:
        return

    for url in search_in_html(self.html, query, self.url):
        grabber = ResourceGrabber(url)
        grabber.download(
            directory,
            filename_model=filename_model,
            ids=ids,
            index=index,
            ids_digit_len=ids_digit_len,
            # Previously ``ids_digit_len`` was forwarded here by mistake,
            # so the caller's ``index_digit_len`` was never honoured.
            index_digit_len=index_digit_len,
            duplicate_check=duplicate_check,
        )
def get_internal_links(self, *args, **kwargs):
    """Yield links found by applying the selectors in ``args`` level by level.

    The page is opened with ``self._open()``. Nothing is yielded when
    ``self.request`` is falsy or its ``status_code`` is outside 200-299.
    Otherwise ``args[level]`` (``level`` is read from ``kwargs`` and
    defaults to 0) is passed to ``search_in_html`` together with
    ``self.html`` and ``self.url``. When further selectors remain, each
    hit is followed through a new ``ResourceGrabber`` with ``level + 1``
    and the links it produces are yielded; otherwise the hit itself is
    yielded.
    """
    self._open()
    if not self.request:
        return

    depth = kwargs.get('level', 0)
    status = self.request.status_code
    if not (200 <= status < 300):
        return

    # ``args`` is fixed for this call, so decide once whether to descend.
    descend = len(args) > depth + 1
    for found in search_in_html(self.html, args[depth], self.url):
        child = ResourceGrabber(found)
        if not descend:
            yield found
            continue
        for nested in child.get_internal_links(*args, level=depth + 1):
            yield nested
def test_relative_urls(self):
    """Searching 'div.relLinks a' with a base URL yields both text-file URLs.

    The base URL 'http://foofiles.org/' is passed as the third argument and
    the results are expected to be absolute URLs on that host.
    """
    found = list(
        search_in_html(self.html, 'div.relLinks a', 'http://foofiles.org/')
    )
    expected = [
        'http://foofiles.org/text1.txt',
        'http://foofiles.org/text2.txt',
    ]
    self.assertEqual(found, expected)
def test_multiple_filter(self):
    """Searching 'div.links a' without a base URL yields both text-file URLs."""
    found = list(search_in_html(self.html, 'div.links a'))
    expected = [
        'http://foofiles.org/text1.txt',
        'http://foofiles.org/text2.txt',
    ]
    self.assertEqual(found, expected)
def test_basic_filter_textual(self):
    """The first result for 'div:eq(3)' is the image1.png URL."""
    matches = list(search_in_html(self.html, 'div:eq(3)'))
    first_match = matches[0]
    self.assertEqual(first_match, 'http://fooimages.org/image1.png')
def test_basic_filter_ahref(self):
    """The first result for 'div.links a' is the text1.txt URL."""
    matches = list(search_in_html(self.html, 'div.links a'))
    first_match = matches[0]
    self.assertEqual(first_match, 'http://foofiles.org/text1.txt')