def crawl_view_pages():
    delegate = Delegate()
    try:
        id = request.args.get('id', type=int)
        crawl = delegate.crawl_get_by_id(id)
        # links = delegate.url_get_all_by_crawl_id(id)
        resources = delegate.resource_get_all_by_crawl(id)
        user = delegate.user_get_by_id(session['user_id'])
        sites = delegate.site_get_all()  # TODO: In the future show only sites for the current user
        return render_template('crawl_view_pages.html', crawl=crawl, resources=resources, user=user, sites=sites)
    except ValueError:
        flash('No crawl id.')
        return redirect(url_for('crawl'))
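# Illustrative sketch (assumption, not taken from this module): crawl_view_pages() is a
# Flask view that reads ?id=<int> from the query string, so it is presumably registered
# on the application object somewhere. The route path below is a guess used only to show
# the wiring, and the helper name is hypothetical.
def _register_crawl_view_pages_sketch(app):
    app.add_url_rule('/crawl/pages', 'crawl_view_pages', crawl_view_pages)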
def bar_data(crawl_id, bar_no):
    """Returns the pages for a bar (interval) inside a crawl."""
    delegate = Delegate()

    # Find the total number of internal full links = T
    no_total = delegate.url_count_internal_full(crawl_id)

    # Select all pages
    pages = delegate.resource_get_all_by_crawl(crawl_id)

    # Percentage bounds of the requested bar
    (lower, upper) = no_2_interval(bar_no)

    # Keep every page whose share of incoming internal links falls inside the bar;
    # the last bar is closed on the right so that pages at exactly 100% are included.
    selected_pages = []
    for page in pages:
        no = delegate.url_count_incoming_for_resource(page.id)
        percent = no * 100 / no_total
        if upper == 100:
            if lower <= percent <= upper:
                selected_pages.append({'page': page, 'no': no})
        else:
            if lower <= percent < upper:
                selected_pages.append({'page': page, 'no': no})

    return selected_pages
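# no_2_interval() is called above but not defined in this excerpt. The sketch below is an
# assumption about its behavior: map a bar index to its [lower, upper) percentage bounds
# using the same STEP as inner_links_data(). It is given a different, hypothetical name so
# it cannot clash with the real helper.
def _no_2_interval_sketch(bar_no, step=10):
    lower = bar_no * step
    upper = lower + step
    return (lower, upper)

# Example: _no_2_interval_sketch(0) -> (0, 10), _no_2_interval_sketch(9) -> (90, 100)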
def inner_links_data(crawl_id):
    """Builds the chart data for the distribution of incoming internal links per page.

    crawl_id - crawl id
    """
    delegate = Delegate()

    # Build the percentage intervals: [0, STEP], [STEP, 2*STEP], ..., [100 - STEP, 100]
    intervals = []
    for i in range(0, 100, STEP):
        intervals.append([i, i + STEP])
    print("Intervals %r " % intervals)

    # Select all pages
    pages = delegate.resource_get_all_by_crawl(crawl_id)

    # For every page count the internal full urls pointing to it = Li
    d = dict()
    check = 0
    for page in pages:
        no = delegate.url_count_incoming_for_resource(page.id)
        d[page.id] = no
        check = check + no

    for k, v in d.items():
        print("\n%d -> %d" % (k, v))

    # Find the total number of internal full links = T
    no_total = delegate.url_count_internal_full(crawl_id)
    print("Total full internal links: %d " % no_total)
    assert check == no_total, "The total number of internal links does not match"

    # For every page compute the percent of internal full urls pointing to it: Pi = Li * 100 / T
    percents = dict()
    for page in pages:
        percents[page.id] = d[page.id] * 100 / no_total

    print("\nPercentages")
    for k, v in percents.items():
        print("\n%d -> %.2f%%" % (k, v))

    # For every interval I1[0-10], I2[10-20], ..., I10[90-100] sum the percentages of the
    # pages that fall into that interval; the last interval is closed on the right so that
    # a page with exactly 100% is counted.
    partials = dict()
    labels = []
    for interval in intervals:
        key = "{}-{}%".format(interval[0], interval[1])
        labels.append(key)
        partials[key] = 0
        for page in pages:
            if interval[1] == 100:
                if interval[0] <= percents[page.id] <= interval[1]:
                    partials[key] = partials[key] + percents[page.id]
            else:
                if interval[0] <= percents[page.id] < interval[1]:
                    partials[key] = partials[key] + percents[page.id]

    print("\nPartials")
    for k, v in partials.items():
        print("\n{} {} ".format(k, v))

    # Prepare the chart data, sample below:
    '''
    {
        labels: ['Red', 'Blue', 'Yellow', 'Green', 'Purple', 'Orange'],
        datasets: [{
            label: '# of Votes',
            data: [12, 19, 3, 5, 2, 3],
            backgroundColor: [
                'rgba(255, 99, 132, 0.2)',
                'rgba(54, 162, 235, 0.2)',
                'rgba(255, 206, 86, 0.2)',
                'rgba(75, 192, 192, 0.2)',
                'rgba(153, 102, 255, 0.2)',
                'rgba(255, 159, 64, 0.2)'
            ],
            borderColor: [
                'rgba(255, 99, 132, 1)',
                'rgba(54, 162, 235, 1)',
                'rgba(255, 206, 86, 1)',
                'rgba(75, 192, 192, 1)',
                'rgba(153, 102, 255, 1)',
                'rgba(255, 159, 64, 1)'
            ],
            borderWidth: 1
        }]
    }
    '''
    new_data = {
        'labels': list(partials.keys()),
        'datasets': [{
            'label': 'Inner links',
            'data': list(partials.values())
        }]
    }

    return new_data
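# Illustrative only (assumption): inner_links_data() returns a plain dict shaped like the
# chart sample above, so a JSON endpoint for the chart could expose it roughly as below.
# The helper name is hypothetical and the use of flask.jsonify is a guess, not something
# shown in this module.
def _inner_links_json_sketch(crawl_id):
    from flask import jsonify
    return jsonify(inner_links_data(crawl_id))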
def test_page(self):
    delegate = XDelegate()
    print("test_page started")

    # Site 1
    site1 = Site()
    site1.name = "Site1"
    site1.url = 'http://foo.com'
    delegate.site_create(site1)

    # Crawl
    crawl = Crawl(site_id=site1.id)
    delegate.crawl_create(crawl)
    assert crawl.id > 0

    no_pages = delegate.resource_count_visited(crawl.id)
    assert no_pages == 0, "No of pages is {}".format(no_pages)

    # Page
    craw_resources = delegate.resource_get_all_by_crawl(crawl.id)
    assert len(craw_resources) == 0

    # test resource_get_by_absolute_url_and_crawl_id()
    r1 = delegate.resource_get_by_absolute_url_and_crawl_id("no such url :p", crawl.id)
    assert r1 is None

    # test resource_is_present()
    present = delegate.resource_is_present('no such url :p', crawl.id)
    assert not present

    page = Resource()
    page.crawl_id = crawl.id
    page.content = "A long content " + "a" * 1024 * 1024
    page.absolute_url = "https://scriptoid.com/index.php"
    delegate.resource_create(page)
    assert page.id > 0

    # test resource_get_by_id()
    r2 = delegate.resource_get_by_id(page.id)
    assert r2.id == page.id

    # test resource_is_present()
    present = delegate.resource_is_present(page.absolute_url, crawl.id)
    assert present

    pages = delegate.resource_get_all()
    assert len(pages) > 0

    no_pages = delegate.resource_count_visited(crawl.id)
    assert no_pages == 1, "No of pages is {}".format(no_pages)

    craw_resources = delegate.resource_get_all_by_crawl(crawl.id)
    assert len(craw_resources) > 0

    r1 = delegate.resource_get_by_absolute_url_and_crawl_id(page.absolute_url, crawl.id)
    assert r1.id == page.id

    # Test cascade delete
    delegate.crawl_delete_all()
    pages = delegate.resource_get_all()
    assert len(pages) == 0, "It should be {} but we found {}".format(0, len(pages))

    # Clean up
    delegate.resource_delete_all()
    delegate.crawl_delete_all()
    delegate.site_delete_all()

    print("test_page done")
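# Illustrative only (assumption): outside the test, the same delegate API exercised by
# test_page() can record a crawled page with the calls shown below. The helper name and
# argument order are hypothetical; the individual calls mirror the ones in the test.
def _record_page_sketch(delegate, crawl_id, url, content):
    page = Resource()
    page.crawl_id = crawl_id
    page.absolute_url = url
    page.content = content
    delegate.resource_create(page)
    return page.id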