def test_find_exclude(self):
    """ Determine if excluding links removes the links """
    for page in td.pages:
        seek = Links(text=page['text'])
        actual_list = seek.find(exclude=[{"class": re.compile("gb1")}])
        self.assertEqual(len(actual_list), page['exclude_links'])
        actual_list = seek.find(exclude=[{"class": "gb1"}])
        self.assertEqual(len(actual_list), page['exclude_links'])
def test_find_reverse_sort(self):
    """ Ensure reverse sort sorts before limiting the # of links """
    for page in td.pages:
        seek = Links(text=page['text'])
        actual_list = seek.find(limit=5, reverse=True)
        self.assertEqual(len(actual_list), len(page['limit_reverse_find']))
        for i, link in enumerate(actual_list):
            self.assertDictSame(link, page['limit_reverse_find'][i])
def test_find_limit(self):
    """ Check that the actual array with a limit matches the test data """
    for page in td.pages:
        seek = Links(text=page['text'])
        actual_list = seek.find(limit=5)
        self.assertEqual(len(actual_list), len(page['limit_find']))
        for i, link in enumerate(actual_list):
            self.assertDictSame(link, page['limit_find'][i])
def test_find_sort_by_href(self):
    """ Sorting by href produces proper results """
    for page in td.pages:
        seek = Links(text=page['text'])
        actual_list = seek.find(limit=5, sort=lambda key: key['href'] or "")
        self.assertEqual(len(actual_list), len(page['limit_sort_href']))
        for i, link in enumerate(actual_list):
            self.assertDictSame(link, page['limit_sort_href'][i])
def test_find(self):
    """ Test how the grabbed hyperlinks are aggregated """
    seek = Links(self.url)
    # Each of these assertions should have its own test method,
    # e.g. "test_find_hyperlink_bad_filter_param".
    self.assertRaises(Exception, seek.find, filters=['href', 'style'])
    self.assertRaises(Exception, seek.find, filters=25)
    self.assertEqual(len(seek.find(limit=5)), 5)
    self.assertEqual(len(seek.find(limit=1)), 1)
def test_find_number_of_links(self):
    """ Ensure expected number of links reflects actual number of links """
    for page in td.pages:
        seek = Links(text=page['text'])
        self.assertEqual(len(seek.find()), page['num_links'])
def test_find_limit_param(self):
    """ Ensure find() honors the limit parameter """
    seek = Links(self.url)
    self.assertEqual(len(seek.find(limit=5)), 5)
    self.assertEqual(len(seek.find(limit=1)), 1)
def test_soup_property(self):
    """ The _soup property yields the parsed web page """
    seek = Links(self.url)
    self.assertIsInstance(seek._soup, bs4.BeautifulSoup)
def test_find_duplicates(self):
    """ Determine if removing duplicates works """
    for page in td.pages:
        seek = Links(text=page['text'])
        actual_list = seek.find(duplicates=False)
        self.assertEqual(len(actual_list), page['duplicate_links'])
import re

from linkGrabber import Links

links = Links('http://google.com')
gb = links.find(limit=4, duplicates=False, pretty=True)
print(gb)
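# A minimal follow-up sketch, not part of the quickstart above: it reuses the
# `links` object and mirrors the exclude usage exercised by test_find_exclude,
# dropping anchors whose "class" attribute matches either a plain string or a
# compiled regex. The "gb1" class name is illustrative only.
filtered = links.find(exclude=[{"class": re.compile("gb1")}], duplicates=False)
print(len(filtered))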
def scraper(url, level_target=0, to=None, sameroot=False, overwrite=False):
    level = 0
    perlevel = defaultdict(list)  # URLs queued per crawl depth
    perlevel[0].append(url)
    visited = defaultdict(int)
    visited[url] = True
    siteinfo = urlparse(url)
    total = defaultdict(int)  # total per level
    finished = 0
    while level <= level_target:
        if not len(perlevel[level]) > 0:
            # Current level exhausted; move one level deeper.
            level += 1
            continue
        url_actual = perlevel[level].pop()
        msg = "[level:{}] [#:{}/{}]"
        idx = total[level] - len(perlevel[level])
        log.info(msg.format(level, idx, total[level]))
        log.info(url_actual)
        page = None
        dirpath = os.path.join(to, str(level))
        make_dir(dirpath)
        # Build a filesystem-safe filename from the URL.
        archivo = encoding_path(url_actual)
        for c in tablereplace:
            archivo = archivo.replace(c, "_")
        log.debug("using as filename:")
        archivo = os.path.join(dirpath, archivo)
        log.debug(archivo)
        from_file = InfoFile(archivo)
        txt_path = "".join([from_file.dirname, os.sep, from_file.name, ".txt"])
        to_txt = InfoFile(txt_path)
        try:
            page = Links(href=url_actual)
            links = page.find(duplicates=False)
        except Exception as e:
            log.error("[FAIL] %s" % e)
            continue
        if not page:
            log.info("The page apparently does not have links.")
            continue
        # Process the URL: save the page text unless it already exists.
        if not overwrite and os.path.exists(to_txt.path):
            log.info("[OK] Text file already exists")
            finished += 1
        else:
            try:
                save_text(from_file, to_txt, page.response.content)
                log.info("[OK] Status: %s" % page.response.status_code)
                finished += 1
            except Exception as e:
                log.error(e)
                log.error("[FAIL] Status: %s" % page.response.status_code)
        print("urls added: ", end="")
        for _url in links:
            if "href" not in _url:
                continue
            _url = urljoin(url, _url["href"])
            if not visited[_url]:
                visited[_url] = True
                link = urlparse(_url)
                if sameroot:
                    if link.netloc != siteinfo.netloc:
                        continue
                perlevel[level + 1].append(_url)
                total[level + 1] += 1
                print("*", end="")
        print()
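# A minimal invocation sketch for scraper(), assuming the helpers it references
# (InfoFile, make_dir, encoding_path, save_text, tablereplace, log) and the
# standard imports (os, defaultdict, urlparse, urljoin) are defined elsewhere
# in this module. The URL, depth, and output directory are placeholders.
if __name__ == "__main__":
    scraper("http://google.com", level_target=1, to="./pages",
            sameroot=True, overwrite=False)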
def test_page(self):
    """ _page() yields the parsed web page """
    seek = Links(self.url)
    self.assertIsInstance(seek._page(), bs4.BeautifulSoup)