def bar_data(crawl_id, bar_no):
    """Return the pages of a crawl that fall inside one bar (interval).

    For every page of the crawl the share of incoming links is computed as
    ``incoming * 100 / total``, where ``total`` is the value returned by
    ``url_count_internal_full`` for the crawl.  A page is selected when its
    share lies in the interval returned by ``no_2_interval(bar_no)``.

    Args:
        crawl_id: id of the crawl whose pages are inspected.
        bar_no: bar number, translated to ``(lower, upper)`` bounds by
            ``no_2_interval``.

    Returns:
        A list of ``{'page': page, 'no': incoming_count}`` dicts, in the
        order the pages were returned by the delegate.
    """
    delegate = Delegate()

    # Find the total number of internal full links = T
    no_total = delegate.url_count_internal_full(crawl_id)

    # Select all pages
    pages = delegate.resource_get_all_by_crawl(crawl_id)

    lower, upper = no_2_interval(bar_no)
    # Only the bar that ends at 100 includes its upper bound, so that a page
    # holding 100% of the links is not dropped.
    upper_inclusive = upper == 100

    selected_pages = []
    for page in pages:
        no = delegate.url_count_incoming_for_resource(page.id)
        # With no links at all, the share is 0 rather than a division by zero.
        percent = no * 100 / no_total if no_total else 0
        if upper_inclusive:
            in_bar = lower <= percent <= upper
        else:
            in_bar = lower <= percent < upper
        if in_bar:
            selected_pages.append({'page': page, 'no': no})
    return selected_pages
def inner_links_data(crawl_id):
    """Build the chart data describing how incoming links spread over pages.

    Steps:
      1. Split 0..100 into intervals of width ``STEP``.
      2. For every page of the crawl count the incoming links (Li).
      3. Turn each count into a percentage of the total number of internal
         full links (Pi = Li * 100 / T).
      4. For every interval sum the percentages of the pages that fall in it.

    Args:
        crawl_id: crawl id.

    Returns:
        A dict ``{'labels': [...], 'datasets': [{'label': 'Inner links',
        'data': [...]}]}`` with one label ("lower-upper%") and one summed
        percentage per interval.

    Raises:
        AssertionError: if the per-page counts do not add up to the total
            reported by ``url_count_internal_full``.
    """
    delegate = Delegate()

    intervals = [[start, start + STEP] for start in range(0, 100, STEP)]
    print("Intervals %r " % intervals)

    # Select all pages
    pages = delegate.resource_get_all_by_crawl(crawl_id)

    # For every page select the no of internal full urls pointing to it. = Li
    incoming = dict()
    check = 0
    for page in pages:
        no = delegate.url_count_incoming_for_resource(page.id)
        incoming[page.id] = no
        check = check + no

    for k, v in incoming.items():
        print("\n%d -> %d" % (k, v))

    # Find the total number of internal full links = T
    no_total = delegate.url_count_internal_full(crawl_id)
    print("Total full internal links: %d " % no_total)

    assert check == no_total, "The no of total internal links do not match"

    # For every page compute the percent of internal full urls pointing to it.
    # Pi = Li * 100 / T; with T == 0 every page holds 0% (avoids dividing by 0).
    percents = dict()
    for page in pages:
        if no_total:
            percents[page.id] = incoming[page.id] * 100 / no_total
        else:
            percents[page.id] = 0

    print("\nPercentages")
    for k, v in percents.items():
        print("\n%d -> %.2f%%" % (k, v))

    # Sum, for every interval I1[0-10], I2[10-20], ..., the percentages of the
    # pages that fall into that interval.
    partials = dict()
    for lower, upper in intervals:
        key = "{}-{}%".format(lower, upper)
        # Only the interval that ends at 100 includes its upper bound.
        upper_inclusive = upper == 100
        partials[key] = 0
        for page in pages:
            percent = percents[page.id]
            if upper_inclusive:
                in_interval = lower <= percent <= upper
            else:
                in_interval = lower <= percent < upper
            if in_interval:
                partials[key] = partials[key] + percent

    print("\nPartials")
    for k, v in partials.items():
        print("\n{} {} ".format(k, v))

    # Prepare the chart data: a list of labels plus one dataset whose 'data'
    # holds the value for each label, e.g.
    #   {labels: ['Red', 'Blue'], datasets: [{label: '# of Votes', data: [12, 19]}]}
    new_data = {
        'labels': list(partials.keys()),
        'datasets': [{
            'label': 'Inner links',
            'data': list(partials.values()),
        }],
    }
    return new_data
def test_link(self):
    """Exercise the delegate's url_* operations on one site, crawl and page.

    Creates a Site, a Crawl and a Resource, checks the url counters and
    lookups before any Url exists, creates three internal Urls and re-checks
    them, then covers url_update, url_delete_all, the external counter,
    url_delete_by_id and finally verifies that resource_delete_all() leaves
    no Url behind.
    """
    delegate = XDelegate()
    print("test_page started")

    def build_url(**fields):
        # Create a Url and assign the given attributes in keyword order.
        link = Url()
        for attr, value in fields.items():
            setattr(link, attr, value)
        return link

    # Site 1
    site = Site()
    site.name = "Site1"
    site.url = 'http://foo.com'
    delegate.site_create(site)

    # Crawl
    crawl = Crawl(site_id=site.id)
    delegate.crawl_create(crawl)
    assert crawl.id > 0

    # Page
    page = Resource()
    page.crawl_id = crawl.id
    page.content = "Ala bala portocala"
    page.absolute_url = "https://scriptoid.com/index.php"
    delegate.resource_create(page)

    # ---- State before any Url exists ----

    # url_is_present()
    present = delegate.url_is_present(
        'https://scriptoid.com/index.php', crawl.id)
    assert not present

    # url_count_unvisited()
    unvisited_count = delegate.url_count_unvisited(crawl_id=crawl.id)
    assert unvisited_count == 0, 'n1 is {}'.format(unvisited_count)

    # url_get_all_by_crawl_id()
    urls_of_crawl = delegate.url_get_all_by_crawl_id(crawl.id)
    assert len(urls_of_crawl) == 0

    # url_count_incoming_for_resource()
    incoming_count = delegate.url_count_incoming_for_resource(page.id)
    assert incoming_count == 0

    # url_count_internal_full()
    internal_full_count = delegate.url_count_internal_full(crawl.id)
    assert internal_full_count == 0

    # ---- Create three internal Urls ----

    url1 = build_url(
        src_resource_id=page.id,
        url='/team',
        absolute_url='https://scriptoid.com/team',
        type=Url.TYPE_INTERNAL,
        crawl_id=crawl.id,
        job_status=Url.JOB_STATUS_IN_PROGRESS,
    )
    created_id = delegate.url_create(url1)
    assert url1.id > 0
    assert created_id == url1.id

    url2 = build_url(
        src_resource_id=page.id,
        dst_resource_id=page.id,
        url='/contact',
        absolute_url='https://scriptoid.com/index.php',
        type=Url.TYPE_INTERNAL,
        crawl_id=crawl.id,
    )
    delegate.url_create(url2)
    assert url2.id > 0

    url3 = build_url(
        dst_resource_id=page.id,
        url='/jobs',
        absolute_url='https://scriptoid.com/jobs.php',
        type=Url.TYPE_INTERNAL,
        crawl_id=crawl.id,
    )
    delegate.url_create(url3)
    assert url3.id > 0

    # ---- State after the Urls were created ----

    # url_count_incoming_for_resource()
    incoming_count = delegate.url_count_incoming_for_resource(page.id)
    assert incoming_count == 1

    # url_get_by_id()
    fetched = delegate.url_get_by_id(url1.id)
    assert fetched.id == url1.id

    # url_is_present()
    present = delegate.url_is_present(
        'https://scriptoid.com/index.php', crawl.id)
    assert present

    # url_get_all_by_crawl_id()
    urls_of_crawl = delegate.url_get_all_by_crawl_id(crawl.id)
    assert len(urls_of_crawl) == 3

    # First unvisited link
    first_unvisited = delegate.url_get_first_unvisited(crawl_id=crawl.id)
    assert first_unvisited.id == url2.id, \
        'l1.id = {} and url.id = {}'.format(first_unvisited.id, url2.id)

    # url_get_all_unvisited()
    all_unvisited = delegate.url_get_all_unvisited(crawl.id)
    assert len(all_unvisited) == 2

    # url_count_unvisited() / url_count_visited()
    unvisited_count = delegate.url_count_unvisited(crawl_id=crawl.id)
    assert unvisited_count == 2, 'n1 is {}'.format(unvisited_count)
    visited_count = delegate.url_count_visited(crawl_id=crawl.id)
    assert visited_count == 0, 'Actually n2 is {}'.format(visited_count)

    # ---- Mark url1 as visited and re-check the counters ----

    url1.job_status = Url.JOB_STATUS_VISITED
    delegate.url_update(url1)

    first_unvisited = delegate.url_get_first_unvisited(crawl_id=crawl.id)
    assert first_unvisited.id == url2.id

    unvisited_count = delegate.url_count_unvisited(crawl_id=crawl.id)
    assert unvisited_count == 2, 'n1 is {}'.format(unvisited_count)
    visited_count = delegate.url_count_visited(crawl_id=crawl.id)
    assert visited_count == 1, 'n2 is {}'.format(visited_count)

    # url_count_internal_full()
    internal_full_count = delegate.url_count_internal_full(crawl.id)
    assert internal_full_count == 1

    # url_count_pending()
    pending_count = delegate.url_count_pending(crawl.id)
    assert pending_count == 2

    # ---- url_delete_all() ----

    delegate.url_delete_all()
    remaining = delegate.url_get_all()
    assert len(remaining) == 0, \
        "When actually there are {}".format(len(remaining))

    # ---- url_count_external() and url_delete_by_id() ----

    external_count = delegate.url_count_external(crawl.id)
    assert external_count == 0

    url4 = build_url(
        dst_resource_id=page.id,
        url='/jobs',
        absolute_url='https://scriptoid.com/jobs.php',
        type=Url.TYPE_EXTERNAL,
        crawl_id=crawl.id,
    )
    delegate.url_create(url4)
    assert url4.id > 0

    external_count = delegate.url_count_external(crawl.id)
    assert external_count == 1

    assert delegate.url_delete_by_id(url4.id)

    # ---- Cascade delete from the parent page (resource_delete_all) to Url ----

    orphan = build_url(
        src_resource_id=page.id,
        url='/contact',
        absolute_url='https://scriptoid.com/index.php',
        type=Url.TYPE_INTERNAL,
        crawl_id=crawl.id,
    )
    delegate.url_create(orphan)
    assert orphan.id > 0

    delegate.resource_delete_all()
    remaining = delegate.url_get_all()
    assert len(remaining) == 0, \
        "When actually there are {}".format(len(remaining))

    # Clean up
    delegate.resource_delete_all()
    delegate.crawl_delete_all()
    delegate.site_delete_all()
    print("test_page done")