    def test_retrieveLinks(self):
        '''Test method for HttpLinksCollector.retrieve_links.'''
        
        # Test 1 - check HttpError 401 in log
        starting_url = "http://www.nature.com"
        target_url = \
            "http://www.nature.com/nature/journal/v438/n7070/full/438900a.html"

        http_links_collector = HttpLinksCollector(starting_url)
        links_retrieved = http_links_collector.retrieve_links(target_url)

        self.assertFalse(links_retrieved,
                         "Retrieved links from: '" + target_url + "'")
        
        # Test 2 - check URLError - protocol irc.
        starting_url = "http://www.nature.com"
        target_url = "irc://irc.freenode.net/wikimedia-ayuda"

        http_links_collector = HttpLinksCollector(starting_url)
        links_retrieved = http_links_collector.retrieve_links(target_url)

        self.assertFalse(links_retrieved,
                         "Retrieved links from: '" + target_url + "'")
    def crawler_start(self):
        '''Method to start crawling.
            * Checks the input parameters.
            * Prints the crawl result on screen as a JSON dictionary.
        '''

        # ArgParse definition rules
        parser = argparse.ArgumentParser(description="Let's crawl the web")

        parser.add_argument('url', nargs=1, help='target URL')
        parser.add_argument('-n', '--number-of-levels', type=int,
                            default=1, help='how deep the crawl will go')

        # Create argument object
        args = parser.parse_args()

        target_url = args.url.pop()
        depth = args.number_of_levels

        # Starting level to retrieve links
        level = 1
        links = {}
        http_links_collector = HttpLinksCollector(target_url)

        links_list = http_links_collector.retrieve_links(
            target_url, depth, level)

        links[target_url] = links_list

        links_result = json.dumps(links, sort_keys=True, indent=4)

        # Print result in json view mode.
        self.logger.info(links_result)
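crawler_start assumes module-level imports of argparse and json, and that retrieve_links walks links recursively until the current level reaches the requested depth. A minimal sketch of that recursion (the _links_on_page helper and the nested-dict return shape are assumptions made for illustration; the shape merely matches the json.dumps output above):

    def retrieve_links(self, target_url, depth=1, level=1):
        '''Sketch: map each link on the page to its own sub-links.'''
        links = self._links_on_page(target_url)  # hypothetical HTML-parsing helper
        if level >= depth:
            # Deepest requested level: return the plain list of links.
            return links
        return {link: self.retrieve_links(link, depth, level + 1)
                for link in links}

With the entry point wired to crawler_start, a run such as python crawler.py http://www.nature.com -n 2 would then print the two-level link dictionary as indented JSON (the script name is illustrative).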