def test_link(self):
    """Exercise the URL (link) persistence methods of ``XDelegate``.

    Creates a site, a crawl and one resource, then checks the URL API
    against them: presence checks, the various counters (unvisited,
    visited, pending, internal-full, external, incoming-for-resource),
    lookups by id / crawl, update of ``job_status``, deletion (all and
    by id) and finally that deleting resources also removes the URLs
    that originate from them.
    """
    delegate = XDelegate()

    print("test_link started")

    # --- Fixtures: site -> crawl -> resource -------------------------
    # Site 1
    site1 = Site()
    site1.name = "Site1"
    site1.url = 'http://foo.com'
    delegate.site_create(site1)

    # Crawl
    crawl = Crawl(site_id=site1.id)
    delegate.crawl_create(crawl)
    assert crawl.id > 0

    # Page
    page = Resource()
    page.crawl_id = crawl.id
    page.content = "Ala bala portocala"
    page.absolute_url = "https://scriptoid.com/index.php"
    delegate.resource_create(page)

    # --- Baseline: no URL exists yet for this crawl ------------------
    # Test url_is_present()
    p1 = delegate.url_is_present('https://scriptoid.com/index.php', crawl.id)
    assert not p1

    # Test url_count_unvisited()
    n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
    assert n1 == 0, 'n1 is {}'.format(n1)

    # Test url_get_all_by_crawl_id()
    crawl_urls = delegate.url_get_all_by_crawl_id(crawl.id)
    assert len(crawl_urls) == 0

    # Test url_count_incoming_for_resource()
    uc1 = delegate.url_count_incoming_for_resource(page.id)
    assert uc1 == 0

    # Test url_count_internal_full()
    cif = delegate.url_count_internal_full(crawl.id)
    assert cif == 0

    # --- Create three internal URLs ----------------------------------
    # url1 starts out IN_PROGRESS; the assertions further down expect it
    # to be excluded from the "unvisited" set (3 URLs, 2 unvisited).
    url1 = Url()
    url1.src_resource_id = page.id
    url1.url = '/team'
    url1.absolute_url = 'https://scriptoid.com/team'
    url1.type = Url.TYPE_INTERNAL
    url1.crawl_id = crawl.id
    url1.job_status = Url.JOB_STATUS_IN_PROGRESS
    lid1 = delegate.url_create(url1)
    assert url1.id > 0
    # url_create() is expected to return the id it assigned to the object.
    assert lid1 == url1.id

    # url2 has both a source and a destination resource (the same page).
    url2 = Url()
    url2.src_resource_id = page.id
    url2.dst_resource_id = page.id
    url2.url = '/contact'
    url2.absolute_url = 'https://scriptoid.com/index.php'
    url2.type = Url.TYPE_INTERNAL
    url2.crawl_id = crawl.id
    delegate.url_create(url2)
    assert url2.id > 0

    # url3 has only a destination resource.
    url3 = Url()
    url3.dst_resource_id = page.id
    url3.url = '/jobs'
    url3.absolute_url = 'https://scriptoid.com/jobs.php'
    url3.type = Url.TYPE_INTERNAL
    url3.crawl_id = crawl.id
    delegate.url_create(url3)
    assert url3.id > 0

    # --- Queries with the three URLs in place ------------------------
    # Test url_count_incoming_for_resource()
    uc1 = delegate.url_count_incoming_for_resource(page.id)
    assert uc1 == 1

    # Test url_get_by_id()
    u1 = delegate.url_get_by_id(url1.id)
    assert u1.id == url1.id

    # Test url_is_present()
    p1 = delegate.url_is_present('https://scriptoid.com/index.php', crawl.id)
    assert p1

    # Test url_get_all_by_crawl_id()
    crawl_urls = delegate.url_get_all_by_crawl_id(crawl.id)
    assert len(crawl_urls) == 3

    # Test first unvisited link
    l1 = delegate.url_get_first_unvisited(crawl_id=crawl.id)
    assert l1.id == url2.id, 'l1.id = {} and url.id = {}'.format(
        l1.id, url2.id)

    # Test url_get_all_unvisited()
    unvisited1 = delegate.url_get_all_unvisited(crawl.id)
    assert len(unvisited1) == 2

    # Test url_count_unvisited()
    n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
    assert n1 == 2, 'n1 is {}'.format(n1)

    n2 = delegate.url_count_visited(crawl_id=crawl.id)
    assert n2 == 0, 'Actually n2 is {}'.format(n2)

    # --- Mark url1 as visited and re-check the counters --------------
    url1.job_status = Url.JOB_STATUS_VISITED
    delegate.url_update(url1)

    # The unvisited set must be unaffected (url1 was never part of it) ...
    l1 = delegate.url_get_first_unvisited(crawl_id=crawl.id)
    assert l1.id == url2.id
    n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
    assert n1 == 2, 'n1 is {}'.format(n1)
    # ... while the visited counter goes from 0 to 1.
    n2 = delegate.url_count_visited(crawl_id=crawl.id)
    assert n2 == 1, 'n2 is {}'.format(n2)

    # Test url_count_internal_full()
    cif = delegate.url_count_internal_full(crawl.id)
    assert cif == 1

    # Test url_count_pending()
    ucp = delegate.url_count_pending(crawl.id)
    assert ucp == 2

    # --- Deletion -----------------------------------------------------
    # Test url_delete_all()
    delegate.url_delete_all()
    links = delegate.url_get_all()
    assert len(links) == 0, "When actually there are {}".format(len(links))

    # Test url_count_external()
    uce = delegate.url_count_external(crawl.id)
    assert uce == 0

    url4 = Url()
    url4.dst_resource_id = page.id
    url4.url = '/jobs'
    url4.absolute_url = 'https://scriptoid.com/jobs.php'
    url4.type = Url.TYPE_EXTERNAL
    url4.crawl_id = crawl.id
    delegate.url_create(url4)
    assert url4.id > 0

    uce = delegate.url_count_external(crawl.id)
    assert uce == 1

    # url_delete_by_id() is expected to return a truthy value on success.
    assert delegate.url_delete_by_id(url4.id)

    # Test a cascade delete from parent Page resource_delete_all() to Link
    url = Url()
    url.src_resource_id = page.id
    url.url = '/contact'
    url.absolute_url = 'https://scriptoid.com/index.php'
    url.type = Url.TYPE_INTERNAL
    url.crawl_id = crawl.id
    delegate.url_create(url)
    assert url.id > 0
    delegate.resource_delete_all()
    links = delegate.url_get_all()
    assert len(links) == 0, "When actually there are {}".format(len(links))

    # --- Clean up -----------------------------------------------------
    # All URLs were already verified gone above, so only resources,
    # crawls and sites remain to be removed.
    delegate.resource_delete_all()
    delegate.crawl_delete_all()
    delegate.site_delete_all()

    print("test_link done")
def test_page(self):
    """Exercise the resource (page) persistence methods of ``XDelegate``.

    Creates a site and a crawl, verifies that no resource exists for the
    crawl, stores one large resource, and then checks lookup by id, lookup
    by absolute URL + crawl id, presence checks, the visited counter and
    listing. Finally verifies that deleting all crawls also removes their
    resources.
    """
    delegate = XDelegate()

    print("test_page started")

    # --- Fixtures: site -> crawl -------------------------------------
    # Site 1
    site1 = Site()
    site1.name = "Site1"
    site1.url = 'http://foo.com'
    delegate.site_create(site1)

    # Crawl
    crawl = Crawl(site_id=site1.id)
    delegate.crawl_create(crawl)
    assert crawl.id > 0

    # --- Baseline: the new crawl has no resources --------------------
    no_pages = delegate.resource_count_visited(crawl.id)
    assert no_pages == 0, "No of pages is {}".format(no_pages)

    # Page
    crawl_resources = delegate.resource_get_all_by_crawl(crawl.id)
    assert len(crawl_resources) == 0

    # test resource_get_by_absolute_url_and_crawl_id()
    r1 = delegate.resource_get_by_absolute_url_and_crawl_id(
        "no such url :p", crawl.id)
    # Identity check: `== None` could be satisfied by a custom __eq__.
    assert r1 is None

    # test resource_is_present()
    present = delegate.resource_is_present('no such url :p', crawl.id)
    assert not present

    # --- Create one resource with a large body -----------------------
    page = Resource()
    page.crawl_id = crawl.id
    # Body of just over 1 MiB: a short prefix plus 1024 * 1024 characters.
    page.content = "A long content " + "a" * 1024 * 1024
    page.absolute_url = "https://scriptoid.com/index.php"
    delegate.resource_create(page)
    assert page.id > 0

    # --- Queries with the resource in place --------------------------
    # test resource_get_by_id()
    r2 = delegate.resource_get_by_id(page.id)
    assert r2.id == page.id

    # test resource_is_present()
    present = delegate.resource_is_present(page.absolute_url, crawl.id)
    assert present

    pages = delegate.resource_get_all()
    assert len(pages) > 0

    no_pages = delegate.resource_count_visited(crawl.id)
    assert no_pages == 1, "No of pages is {}".format(no_pages)

    crawl_resources = delegate.resource_get_all_by_crawl(crawl.id)
    assert len(crawl_resources) > 0

    r1 = delegate.resource_get_by_absolute_url_and_crawl_id(
        page.absolute_url, crawl.id)
    assert r1.id == page.id

    # --- Test cascade delete: removing crawls must remove resources --
    delegate.crawl_delete_all()
    pages = delegate.resource_get_all()
    assert len(pages) == 0, "It should be {} but we found {}".format(
        0, len(pages))

    # --- Clean up -----------------------------------------------------
    delegate.resource_delete_all()
    delegate.crawl_delete_all()
    delegate.site_delete_all()

    print("test_page done")