def node_two_selection(self, tree_view, path, column):
    ''' Event handler for list items in the node 2 grid '''
    # Block event handlers so we don't fire as we update them
    self.toggle_tree_view_activated_handler()
    # Grab the selected row and match it back to its node by title
    row_num = tree_view.get_cursor()[0].get_indices()
    node = [x for x in self.node_lists[2]
            if x.title == tree_view.get_model()[row_num][0]][0]
    self.logger.debug('Selected Node: ' + node.title)
    spider = Spider()
    # Build our node out only if we don't have children yet
    if node.children is None:
        node = spider.build_node(node.base_url, node.parent, 1)
    # Slide the window one level deeper: lists 1 and 2 shift left, and the
    # selected node's children become the new node 2 list
    self.set_node_lists(self.node_lists[1], self.node_lists[2], node.children)
    self.tree_views[1].set_cursor(row_num)
    self.render_web_kit(node.base_url)
    # Unblock the handlers now that the update is complete
    self.toggle_tree_view_activated_handler()
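# set_node_lists is called by the selection handlers in this class but defined
# elsewhere. A minimal sketch of the sliding-window behaviour the handlers
# appear to rely on; the body below is an assumption, not the actual
# implementation.
def set_node_lists(self, node0, node1, node2):
    ''' Rebind the three visible grids and refresh their list stores '''
    # Each argument may be a single node or a list of nodes (assumption)
    self.node_lists = [n if isinstance(n, list) else [n]
                       for n in (node0, node1, node2)]
    for tree_view, node_list in zip(self.tree_views, self.node_lists):
        store = tree_view.get_model()
        store.clear()                      # drop the stale rows
        for node in node_list:
            store.append([node.title])     # single text column: the node title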
def node_zero_selection(self, tree_view, path, column):
    ''' Event handler for list items in the node 0 grid '''
    try:
        # Block event handlers so we don't fire as we update them
        self.toggle_tree_view_activated_handler()
        # Grab the currently selected row
        row_num = tree_view.get_cursor()[0].get_indices()
        # Find the node in our node 0 list by matching the selected title
        node = [x for x in self.node_lists[0]
                if x.title == tree_view.get_model()[row_num][0]][0]
        self.logger.debug('Selected Node: ' + node.title)
        spider = Spider()
        # Build our node out only if we don't have children
        if node.children is None:
            node = spider.build_node(node.base_url, node.parent, 1)
        # If this is the root node
        if node.parent is None:
            self.reset_root_node(node)
        # If this node's parent is root, we can't set its peers
        elif node.parent.parent is None:
            self.set_node_lists(node.parent, node.parent.children, node.children)
        # If this node's parent has peers (i.e. its parent has a parent with
        # children), we set those children to be the node 0 list
        else:
            self.set_node_lists(node.parent.parent.children,
                                node.parent.children, node.children)
        self.render_web_kit(node.base_url)
        self.tree_views[0].set_cursor(row_num)
        self.toggle_tree_view_activated_handler()
    except Exception as e:
        self.logger.error(e)
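# toggle_tree_view_activated_handler is what lets the handlers above rewrite
# the grids without re-triggering themselves. A hedged sketch using GObject
# handler blocking; handler_ids and handlers_blocked are hypothetical names
# for state that would be set up in __init__ when the handlers are connected.
def toggle_tree_view_activated_handler(self):
    ''' Block or unblock the selection handlers on every tree view '''
    for tree_view, handler_id in zip(self.tree_views, self.handler_ids):
        if self.handlers_blocked:
            tree_view.handler_unblock(handler_id)  # resume delivering events
        else:
            tree_view.handler_block(handler_id)    # suppress events during updates
    self.handlers_blocked = not self.handlers_blocked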
import unittest
from unittest.mock import patch
from urllib.error import URLError

from hamcrest import assert_that, calling, equal_to, is_, raises

# Module paths here are inferred from the patch targets below
from spider import Spider, WikiNode


class TestSpider(unittest.TestCase):
    ''' Tests the Spider class and methods '''

    def setUp(self):
        self.spider = Spider()

    @patch('urllib.request.urlopen')
    def test_get_page_calls_url_open(self, mock_open):
        ''' Validate that we are calling the module with the passed-in URL '''
        self.spider.get_page('test')
        mock_open.assert_called_with('test')

    @patch('spider.Spider.logger.error')  # Remember the mock vars are in reverse order
    @patch('urllib.request.urlopen')
    def test_get_page_raises_exception(self, mock_open, mock_logger):
        ''' Validate that get_page() logs and rethrows the exception '''
        mock_open.side_effect = URLError('Test Exception')
        assert_that(calling(self.spider.get_page).with_args('test'), raises(URLError))
        mock_logger.assert_called()

    @patch('bs4.BeautifulSoup.__init__')
    def test_get_soup_calls_bs4_init(self, mock_soup):
        ''' Validate that we are calling the module with the passed-in URL '''
        mock_soup.return_value = None
        self.spider.get_soup('test')
        # We want to validate that soup was called with 'test',
        # but we don't want the test to rely on the 'lxml' parser.
        # Erring on the side of a slightly fragile test.
        mock_soup.assert_called_with('test', 'lxml')

    @patch('spider.Spider.logger.error')
    @patch('bs4.BeautifulSoup.__init__')
    def test_get_soup_raises_exception(self, mock_soup, mock_logger):
        ''' Validate that get_soup() logs and rethrows the exception '''
        # In reality this would be an HTMLParser.HTMLParseError or something,
        # but instead of pulling in all those dependencies, let's just validate
        # that it passes through the exception
        mock_soup.side_effect = ValueError('Test Exception')
        assert_that(calling(self.spider.get_soup).with_args('test'), raises(ValueError))
        mock_logger.assert_called()

    @patch('urllib.parse.urlparse')
    @patch('bs4.element.Tag.find_all')
    @patch('spider.Spider.get_soup')
    @patch('spider.Spider.logger.error')
    def test_get_links_with_absolute_link_returns_page_links(
            self, mock_logger, mock_soup, mock_find_all, mock_parse):
        ''' Mock the external calls and verify that it doesn't prepend the prefix '''
        _prefix = 'test'
        pages = ['page_one', 'page_two']
        # Mock bs4 and make sure that it has a find_all method that returns an iterable
        mock_soup().find_all.return_value = [{'href': page} for page in pages]
        # Mock urlparse and make sure that it claims that we have an absolute link
        mock_parse().netloc = True
        out = self.spider.get_links('page', _prefix)
        assert_that(out, is_(equal_to(pages)))

    @patch('urllib.parse.urlparse')
    @patch('bs4.element.Tag.find_all')
    @patch('spider.Spider.get_soup')
    @patch('spider.Spider.logger.error')
    def test_get_links_with_relative_link_returns_page_links(
            self, mock_logger, mock_soup, mock_find_all, mock_parse):
        ''' Mock the external calls and verify that it prepends the prefix for a relative URL '''
        _prefix = 'test'
        pages = ['page_one', 'page_two']
        # Mock bs4 and make sure that it has a find_all method that returns an iterable
        mock_soup().find_all.return_value = [{'href': page} for page in pages]
        # Mock urlparse and make sure that it claims that we have a relative link
        mock_parse().netloc = False
        out = self.spider.get_links('page', _prefix)
        assert_that(out, is_(equal_to([_prefix + x for x in pages])))

    @patch('urllib.parse.urlparse')
    @patch('bs4.element.Tag.find_all')
    @patch('spider.Spider.get_soup')
    @patch('spider.Spider.logger.error')
    def test_get_links_with_invalid_link_logs_exception(
            self, mock_logger, mock_soup, mock_find_all, mock_parse):
        ''' Mock the external calls and verify that urllib errors are logged and not thrown '''
        _prefix = 'test'
        pages = ['page_one', 'page_two']
        # Mock bs4 and make sure that it has a find_all method that returns an iterable
        mock_soup().find_all.return_value = [{'href': page} for page in pages]
        # Mock urlparse so it raises, as it would for a malformed URL
        mock_parse.side_effect = ValueError('')
        out = self.spider.get_links('page', _prefix)
        assert_that(out, is_(equal_to([])))  # In this case all links are going to fail
        mock_logger.assert_called()  # Validate that we logged the exception

    def test_filter_links_filters_on_prefix(self):
        ''' Test that filter_links will filter out the invalid links '''
        _prefix = 'abc'
        urls = ['abc/wiki/1', 'abc/wiki/2', '3', 'any_other/domain/abc']
        self.spider.wiki_prefix = _prefix
        filtered_links = self.spider.filter_links(urls)
        assert_that(filtered_links, is_(equal_to(['abc/wiki/1', 'abc/wiki/2'])))

    @patch('urllib.parse.unquote')
    def test_build_title_from_url_returns_title(self, mock_parse):
        ''' Mock urllib and ensure that we call it with the last part of the URL '''
        self.spider.build_title_from_url('http://en.wikipedia.org/wiki/_test_name_')
        mock_parse.assert_called_with(' test name ')

    @patch('spider.Spider.logger.error')
    def test_build_title_from_url_with_invalid_url_raises_valueerror(self, mock_logger):
        ''' Mock urllib and ensure any other URL raises an error and logs it '''
        assert_that(calling(self.spider.build_title_from_url)
                    .with_args('any other title really '), raises(ValueError))
        mock_logger.assert_called()

    def test_build_node_with_0_depth_returns_node(self):
        ''' Ensure that we create a node with no children correctly '''
        url = 'http://en.wikipedia.org/wiki/title'
        expected_node = WikiNode(url, 'title', None)
        returned_node = self.spider.build_node(url, None, 0)
        assert_that(returned_node, is_(equal_to(expected_node)))

    @patch('spider.Spider.filter_links')
    @patch('spider.Spider.get_links')
    @patch('spider.Spider.get_page')
    @patch('spider.Spider.get_soup')
    def test_build_node_with_1_depth_calls_build_node_on_children(
            self, mock_soup, mock_page, mock_links, mock_filter):
        ''' Ensure that we recursively call build_node on children and build the node correctly '''
        urls = {'title': 'http://en.wikipedia.org/wiki/title',
                'b': 'http://en.wikipedia.org/wiki/b',
                'a': 'http://en.wikipedia.org/wiki/a'}
        expected_node = WikiNode(urls['title'], 'title', 'parent')
        expected_child_a = WikiNode(urls['a'], 'a', expected_node)
        expected_child_b = WikiNode(urls['b'], 'b', None)
        expected_node.children = [expected_child_a, expected_child_b]
        # filter_links returns [b, a] but the expected children are [a, b],
        # so build_node is expected to order the children by title
        mock_filter.return_value = [urls['b'], urls['a']]
        returned_node = self.spider.build_node(urls['title'], 'parent', 1)
        assert_that(returned_node, is_(equal_to(expected_node)))
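# The equality assertions above only work if WikiNode defines value equality.
# A hypothetical sketch consistent with those assertions follows: it compares
# base_url, title, and children, but deliberately ignores parent, since
# expected_child_b is built with parent=None yet must still match, and
# comparing parents would recurse back up the tree.
class WikiNode:
    ''' A page in the wiki graph: its URL, title, parent, and children '''

    def __init__(self, base_url, title, parent):
        self.base_url = base_url
        self.title = title
        self.parent = parent
        self.children = None

    def __eq__(self, other):
        return (isinstance(other, WikiNode)
                and self.base_url == other.base_url
                and self.title == other.title
                and self.children == other.children)

# Assuming the tests live in test_spider.py, run the suite with:
#   python -m unittest test_spider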