Example #1
 def test_find_exclude(self):
     """ Determine if excluding links removes the links """
     for page in td.pages:
         seek = Links(text=page['text'])
         actual_list = seek.find(exclude=[{"class": re.compile("gb1")}])
         self.assertEqual(len(actual_list), page['exclude_links'])
         actual_list = seek.find(exclude=[{"class": "gb1"}])
         self.assertEqual(len(actual_list), page['exclude_links'])
Example #2
 def test_find_reverse_sort(self):
     """ Ensure reverse sort sorts before limiting the # of links """
     for page in td.pages:
         seek = Links(text=page['text'])
         actual_list = seek.find(limit=5, reverse=True)
         self.assertEqual(len(actual_list), len(page['limit_reverse_find']))
         for i, link in enumerate(actual_list):
             self.assertDictSame(link, page['limit_reverse_find'][i])
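The docstring is the important detail here: reversing is applied to the full result set before the limit, so limit=5 with reverse=True returns the tail of the unreversed ordering. A minimal sketch of that equivalence, assuming Links accepts raw markup via text= as in the tests above (the markup is made up):

# Hypothetical markup; reverse=True should flip the whole list before limit trims it.
from linkGrabber import Links

seek = Links(text="<a href='/a'>a</a><a href='/b'>b</a><a href='/c'>c</a>")
reversed_then_limited = seek.find(limit=2, reverse=True)
manually_reversed = seek.find()[::-1][:2]
assert [l["href"] for l in reversed_then_limited] == [l["href"] for l in manually_reversed]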
Example #3
 def test_find_limit(self):
     """ Check that the actual array with a limit matches the test data """
     for page in td.pages:
         seek = Links(text=page['text'])
         actual_list = seek.find(limit=5)
         self.assertEqual(len(actual_list), len(page['limit_find']))
         for i, link in enumerate(actual_list):
             self.assertDictSame(link, page['limit_find'][i])
Example #4
 def test_find_sort_by_href(self):
     """ Sorting by href produces proper results """
     for page in td.pages:
         seek = Links(text=page['text'])
         actual_list = seek.find(limit=5, sort=lambda key: key['href'] or "")
         self.assertEqual(len(actual_list), len(page['limit_sort_href']))
         for i, link in enumerate(actual_list):
             self.assertDictSame(link, page['limit_sort_href'][i])
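The or "" in the sort key is doing real work: a link without an href yields None, and ordering None against strings raises a TypeError under Python 3. A small sketch of the same key on plain dicts (the data is invented):

links = [{"href": "/b"}, {"href": None}, {"href": "/a"}]
# Without the fallback, sorted(links, key=lambda k: k["href"]) fails on the None entry.
sorted_links = sorted(links, key=lambda k: k["href"] or "")
print([l["href"] for l in sorted_links])  # [None, '/a', '/b'] -- the None link sorts first as ""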
Example #5
 def test_find(self):
     """ Test how grabbed hyperlinks are aggregated """
     seek = Links(self.url)
     # Each of these assertions should have their own test method.
     self.assertRaises(Exception, seek.find, filters=['href', 'style'])
     # ex: "test_find_hyperlink_bad_filter_param"
     self.assertRaises(Exception, seek.find, filters=25)
     self.assertEqual(len(seek.find(limit=5)), 5)
     self.assertEqual(len(seek.find(limit=1)), 1)
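These assertions only pin down what filters must not be (a list of attribute names, an integer). Presumably a valid value is a dict mapping attributes to strings or compiled patterns, mirroring the entries passed to exclude in Example #1; that is an assumption, not something these tests confirm:

# Assumption: filters takes a dict of attribute -> string or compiled regex,
# analogous to the dicts used with exclude above.
import re
from linkGrabber import Links

seek = Links("http://google.com")
matching = seek.find(filters={"class": re.compile("gb1")}, limit=5)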
Example #6
 def test_find_number_of_links(self):
     """ Ensure expected number of links reflects actual number of links """
     for page in td.pages:
         seek = Links(text=page['text'])
         self.assertEqual(len(seek.find()), page['num_links'])
Example #7
 def test_find_limit_param(self):
     """ How does find() handle the limit property """
     seek = Links(self.url)
     self.assertEqual(len(seek.find(limit=5)), 5)
     self.assertEqual(len(seek.find(limit=1)), 1)
Example #8
 def test_soup_property(self):
     """ Getting the web page yields correct response"""
     seek = Links(self.url)
     self.assertIsInstance(seek._soup, bs4.BeautifulSoup)
Example #9
 def test_find_duplicates(self):
     """ Determine if removing duplicates works """
     for page in td.pages:
         seek = Links(text=page['text'])
         actual_list = seek.find(duplicates=False)
         self.assertEqual(len(actual_list), page['duplicate_links'])
Example #10
import re
from linkGrabber import Links

links = Links('http://google.com')
gb = links.find(limit=4, duplicates=False, pretty=True)
print(gb)
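The import re at the top hints at the regex support exercised in the test examples above; here is a variation of the same snippet that also drops the gb1 links, reusing the exclude form from Example #1 (a sketch, assuming exclude combines freely with the other keyword arguments):

import re
from linkGrabber import Links

links = Links('http://google.com')
# Skip any link whose class matches gb1, as in the exclude tests above.
gb = links.find(limit=4, duplicates=False, pretty=True,
                exclude=[{"class": re.compile("gb1")}])
print(gb)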
Example #11
def scraper(url, level_target=0, to=None, sameroot=False, overwrite=False):
    level = 0
    perlevel = defaultdict(list)
    perlevel[0].append(url)

    visited = defaultdict(int)
    visited[url] = True

    siteinfo = urlparse(url)
    total = defaultdict(int)  # total per level
    finished = 0

    while level <= level_target:
        if not perlevel[level]:
            level += 1
            continue
        url_actual = perlevel[level].pop()
        msg = "[level:{}] [#:{}/{}]"
        idx = total[level] - len(perlevel[level])
        log.info(msg.format(level, idx, total[level]))
        log.info(url_actual)

        page = None
        dirpath = os.path.join(to, str(level))
        make_dir(dirpath)
        archivo = encoding_path(url_actual)

        for c in tablereplace:
            archivo = archivo.replace(c, "_")
        log.debug("using as filename:")
        archivo = os.path.join(dirpath, archivo)
        log.debug(archivo)

        try:
            from_file = InfoFile(archivo)
            txt_path = "".join([from_file.dirname, os.sep, from_file.name, ".txt"])
            to_txt = InfoFile(txt_path)

            try:
                page = Links(href=url_actual)
                links = page.find(duplicates=False)
            except Exception as e:
                log.error("[FAIL] %s" % e)
                continue

            if not links:
                log.info("The page apparently does not have links.")
                continue
            # process url
            if not overwrite and os.path.exists(to_txt.path):
                log.info("[OK] Text file already exists")
                finished += 1
            else:
                try:
                    save_text(from_file, to_txt, page.response.content)
                    log.info("[OK] Status: %s" % page.response.status_code)
                    finished += 1
                except Exception as e:
                    log.error(e)
                    log.error("[FAIL] Status: %s" % page.response.status_code)

            print("urls added: ", end="")
            for _url in links:
                if "href" not in _url:
                    continue
                _url = urljoin(url, _url["href"])
                if not visited[_url]:
                    visited[_url] = True
                    link = urlparse(_url)
                    if sameroot:
                        if link.netloc != siteinfo.netloc:
                            continue
                    perlevel[level + 1].append(_url)
                    total[level + 1] += 1
                    print("*", end="")

            print()
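A sketch of how the function above might be invoked; the URL, output directory, and depth are hypothetical, and the helpers it relies on (InfoFile, make_dir, encoding_path, save_text, log, tablereplace) are assumed to come from the surrounding module:

# Crawl the start page plus everything it links to (one extra level),
# staying on the same host and writing text dumps under /tmp/crawl.
scraper("http://example.com", level_target=1, to="/tmp/crawl", sameroot=True)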
Example #12
 def test_page(self):
     """ Getting the web page yields correct response"""
     seek = Links(self.url)
     self.assertIsInstance(seek._page(), bs4.BeautifulSoup)