def crawl_exe():
    """Handle the crawl form: persist a new Crawl, launch a CrawlerDB and render progress."""
    global PROGRESS_TRACKER
    global CRAWLS

    delegate = Delegate()

    @copy_current_request_context
    def notify(msg):
        # Progress callback invoked by the crawler; msg carries 'crawlId' and 'status'.
        crawl_key = str(msg['crawlId'])
        # FIXME: Naive way to avoid leaks (byt keeping references)
        if msg['status'] == 'done':
            del CRAWLS[int(crawl_key)]
        # progress = ProgressTracker._msg_to_progress(msg)
        # pj = jsonpickle.encode(progress)
        PROGRESS_TRACKER.set_progress(crawl_key, msg)
        try:
            crawl = delegate.crawl_get_by_id(crawl_key)
        except ValueError as ve:
            flash('No crawl id.')
            return redirect(url_for('crawl'))

    # Reject an empty address before doing any work.
    if not request.form['address']:
        flash('No address.')
        return redirect(url_for('crawl'))

    user = delegate.user_get_by_id(session['user_id'])
    # TODO: In the future show only sites for current user
    sites = delegate.site_get_all()

    # Save to DB
    crawl = Crawl(site_id=user.current_site_id)
    delegate.crawl_create(crawl)

    initial_url = request.form['address']
    raw_max = request.form['max']
    # A positive integer in the 'max' field caps the crawl; anything else means unlimited.
    max_links = int(raw_max) if len(raw_max) > 0 and int(raw_max) > 0 else 0

    crawler = CrawlerDB(delegate, initial_url,
                        id=crawl.id, no_workers=10, max_links=max_links)
    CRAWLS[crawl.id] = crawler
    crawler.addListener(notify)
    crawler.start()

    return render_template('crawl_progress.html', crawl=crawl, user=user, sites=sites)
def test_crawl(self):
    """Exercise crawl creation, per-site retrieval and last-crawl lookup via the delegate."""
    delegate = XDelegate()
    print("test_crawl started")
    # session = delegate.get_session()

    # Site 1
    site1 = Site()
    site1.name = "Site1"
    site1.url = 'http://foo.com'
    delegate.site_create(site1)

    # Crawl
    crawl = Crawl(site_id=site1.id)
    delegate.crawl_create(crawl)
    assert crawl.id > 0

    # Create a second crawl dated 2 minutes AFTER the first so it is the latest
    # (subtracting a negative timedelta moves the date forward; the original
    # comment claimed "in the past", which contradicted the last-crawl assert below).
    delta = datetime.timedelta(minutes=-2)
    t2 = crawl.date - delta
    crawl2 = Crawl(site_id=site1.id, date=t2)
    delegate.crawl_create(crawl2)
    assert crawl2.id > 0

    sites = delegate.site_get_all()
    print("No of site: {}".format(len(sites)))
    assert len(sites) == 1

    crawls = delegate.crawl_get_all()
    assert len(crawls) == 2

    # BUG FIX: the original asserted len(crawls) here, re-checking the previous
    # result and leaving crawl_get_all_for_site() completely unverified.
    crawls2 = delegate.crawl_get_all_for_site(site1.id)
    assert len(crawls2) == 2

    last_crawl = delegate.crawl_get_last_for_site(site1.id)
    assert last_crawl.id == crawl2.id, "Last crawl id was {} when it should be {}".format(
        last_crawl.id, crawl2.id)

    # Clean up
    # delegate.crawl_delete_all()
    delegate.site_delete_all()
    print("test_crawl done")
def main():
    """Crawl a domain given on the command line and log summary statistics.

    Parses -d/--domain, -w/--workers, -m/--max-links and --delay, seeds the DB,
    runs a CrawlerDB to completion and logs internal/external link counts.
    """
    # domain = 'localhost:7000'
    domain = 'http://abctimetracking.com'
    max_links = 0

    # Empty DB
    from manage_db import empty, mock
    mock()

    # Parse arguments
    parser = argparse.ArgumentParser(description="A simple website crawler.")
    # BUG FIX: '--domain' was declared required=True while also carrying a default
    # AND a "No domain passed" fallback below — the fallback was dead code and the
    # flag was forced on every invocation. It is now optional, as the fallback intends.
    parser.add_argument('-d', '--domain', type=str, default=domain,
                        help='Domain to crawl')
    parser.add_argument('-w', '--workers', type=int, default=10,
                        help='Number of workers')
    parser.add_argument('-m', '--max-links', type=int, default=0,
                        help='Maximum no. of links to index')
    parser.add_argument('--delay', type=int, default=0,
                        help='Delay between requests')
    args = parser.parse_args()

    if args.domain:
        domain = args.domain
    else:
        print('No domain passed, using %s.' % domain)
        print(
            'Read usage details in file header for more information on passing arguments.'
        )

    if args.max_links:
        max_links = args.max_links

    # BUG FIX: blindly prepending 'http://' produced malformed URLs such as
    # 'http://http://abctimetracking.com' whenever the domain already had a scheme
    # (the default above does). Only add the scheme when it is missing.
    if domain.startswith(('http://', 'https://')):
        theURL = domain
    else:
        theURL = 'http://' + domain
    noOfWorkers = args.workers

    delegate = Delegate()

    site = Site(name=domain, url=theURL)
    delegate.site_create(site)

    crawl = Crawl(site_id=site.id)
    delegate.crawl_create(crawl)

    crawler = CrawlerDB(initialLink=theURL, max_links=max_links,
                        no_workers=noOfWorkers, delegate=delegate, id=crawl.id)
    # crawler = CrawlerDB(max_links=max_links, no_workers=noOfWorkers, delegate=delegate, id=1)

    t1 = time.time()
    crawler.start()
    crawler.join()
    t2 = time.time()
    total_time = t2 - t1

    logger.info("Total internal links visited: %d in: %ds" %
                (crawler.no_visited_urls(), total_time))
    # for url in [link.absolute_url for link in crawler.visited]:
    #     logger.info("\t" + url)

    logger.info("Total external links: %d" % crawler.no_external_urls())
    # for url in [link.absolute_url for link in crawler.external_links]:
    #     logger.info("\t" + url)

    # report('./crawl-requests-report.log', crawler.visited)
    # crawler.export()

    print("All done. In limbo")
    crawler = None
    time.sleep(10)
def test_link(self):
    """Exercise Url create/lookup/count/update and cascade-delete behaviour via the delegate."""
    delegate = XDelegate()
    print("test_page started")

    def make_url(**attrs):
        # Build a Url and set the given attributes in keyword order.
        u = Url()
        for key, value in attrs.items():
            setattr(u, key, value)
        return u

    # Site 1
    site1 = Site()
    site1.name = "Site1"
    site1.url = 'http://foo.com'
    delegate.site_create(site1)

    # Crawl
    crawl = Crawl(site_id=site1.id)
    delegate.crawl_create(crawl)
    assert crawl.id > 0

    # Page
    page = Resource()
    page.crawl_id = crawl.id
    page.content = "Ala bala portocala"
    page.absolute_url = "https://scriptoid.com/index.php"
    delegate.resource_create(page)

    # Link
    # Test url_is_present()
    p1 = delegate.url_is_present('https://scriptoid.com/index.php', crawl.id)
    assert not p1

    # Test url_count_unvisited()
    n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
    assert n1 == 0, 'n1 is {}'.format(n1)

    # Test url_get_all_by_crawl_id()
    crawl_urls = delegate.url_get_all_by_crawl_id(crawl.id)
    assert len(crawl_urls) == 0

    # Test url_count_incoming_for_resource()
    uc1 = delegate.url_count_incoming_for_resource(page.id)
    assert uc1 == 0

    # Test url_count_internal_full()
    cif = delegate.url_count_internal_full(crawl.id)
    assert cif == 0

    url1 = make_url(src_resource_id=page.id,
                    url='/team',
                    absolute_url='https://scriptoid.com/team',
                    type=Url.TYPE_INTERNAL,
                    crawl_id=crawl.id,
                    job_status=Url.JOB_STATUS_IN_PROGRESS)
    lid1 = delegate.url_create(url1)
    assert url1.id > 0
    assert lid1 == url1.id

    url2 = make_url(src_resource_id=page.id,
                    dst_resource_id=page.id,
                    url='/contact',
                    absolute_url='https://scriptoid.com/index.php',
                    type=Url.TYPE_INTERNAL,
                    crawl_id=crawl.id)
    delegate.url_create(url2)
    assert url2.id > 0

    url3 = make_url(dst_resource_id=page.id,
                    url='/jobs',
                    absolute_url='https://scriptoid.com/jobs.php',
                    type=Url.TYPE_INTERNAL,
                    crawl_id=crawl.id)
    delegate.url_create(url3)
    assert url3.id > 0

    # Test url_count_incoming_for_resource()
    uc1 = delegate.url_count_incoming_for_resource(page.id)
    assert uc1 == 1

    # Test url_get_by_id()
    u1 = delegate.url_get_by_id(url1.id)
    assert u1.id == url1.id

    # Test url_is_present()
    p1 = delegate.url_is_present('https://scriptoid.com/index.php', crawl.id)
    assert p1

    # Test url_get_all_by_crawl_id()
    crawl_urls = delegate.url_get_all_by_crawl_id(crawl.id)
    assert len(crawl_urls) == 3

    # Test first unvisited link
    l1 = delegate.url_get_first_unvisited(crawl_id=crawl.id)
    assert l1.id == url2.id, 'l1.id = {} and url.id = {}'.format(
        l1.id, url2.id)

    # Test url_get_all_unvisited()
    unvisited1 = delegate.url_get_all_unvisited(crawl.id)
    assert len(unvisited1) == 2

    # Test url_count_unvisited()
    n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
    assert n1 == 2, 'n1 is {}'.format(n1)

    n2 = delegate.url_count_visited(crawl_id=crawl.id)
    assert n2 == 0, 'Actually n2 is {}'.format(n2)

    # Mark url1 visited and re-check the unvisited/visited tallies.
    url1.job_status = Url.JOB_STATUS_VISITED
    delegate.url_update(url1)
    l1 = delegate.url_get_first_unvisited(crawl_id=crawl.id)
    assert l1.id == url2.id
    n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
    assert n1 == 2, 'n1 is {}'.format(n1)
    n2 = delegate.url_count_visited(crawl_id=crawl.id)
    assert n2 == 1, 'n2 is {}'.format(n2)

    # Test url_count_internal_full()
    cif = delegate.url_count_internal_full(crawl.id)
    assert cif == 1

    # Test url_count_pending()
    ucp = delegate.url_count_pending(crawl.id)
    assert ucp == 2

    # Test url_delete_all()
    delegate.url_delete_all()
    links = delegate.url_get_all()
    assert len(links) == 0, "When actually there are {}".format(len(links))

    # Test url_count_external()
    uce = delegate.url_count_external(crawl.id)
    assert uce == 0
    url4 = make_url(dst_resource_id=page.id,
                    url='/jobs',
                    absolute_url='https://scriptoid.com/jobs.php',
                    type=Url.TYPE_EXTERNAL,
                    crawl_id=crawl.id)
    delegate.url_create(url4)
    assert url4.id > 0
    uce = delegate.url_count_external(crawl.id)
    assert uce == 1
    assert delegate.url_delete_by_id(url4.id)

    # Test a cascade delete from parent Page resource_delete_all() to Link
    url = make_url(src_resource_id=page.id,
                   url='/contact',
                   absolute_url='https://scriptoid.com/index.php',
                   type=Url.TYPE_INTERNAL,
                   crawl_id=crawl.id)
    delegate.url_create(url)
    assert url.id > 0
    delegate.resource_delete_all()
    links = delegate.url_get_all()
    assert len(links) == 0, "When actually there are {}".format(len(links))

    # Clean up
    # delegate.link_delete_all()
    delegate.resource_delete_all()
    delegate.crawl_delete_all()
    delegate.site_delete_all()
    print("test_page done")
def test_page(self):
    """Exercise Resource create/lookup/count and cascade-delete behaviour via the delegate."""
    delegate = XDelegate()
    print("test_page started")

    # Site 1
    site1 = Site()
    site1.name = "Site1"
    site1.url = 'http://foo.com'
    delegate.site_create(site1)

    # Crawl
    crawl = Crawl(site_id=site1.id)
    delegate.crawl_create(crawl)
    assert crawl.id > 0

    no_pages = delegate.resource_count_visited(crawl.id)
    assert no_pages == 0, "No of pages is {}".format(no_pages)

    # Page
    craw_resources = delegate.resource_get_all_by_crawl(crawl.id)
    assert len(craw_resources) == 0

    # test resource_get_by_absolute_url_and_crawl_id()
    r1 = delegate.resource_get_by_absolute_url_and_crawl_id(
        "no such url :p", crawl.id)
    # FIX: identity comparison with `is None` instead of `== None` (PEP 8;
    # `==` dispatches to __eq__ and is unreliable for None checks on ORM objects).
    assert r1 is None

    # test resource_is_present()
    present = delegate.resource_is_present('no such url :p', crawl.id)
    assert not present

    page = Resource()
    page.crawl_id = crawl.id
    # ~1 MB body to exercise large-content persistence.
    page.content = "A long content " + "a" * 1024 * 1024
    page.absolute_url = "https://scriptoid.com/index.php"
    delegate.resource_create(page)
    assert page.id > 0

    # test resource_get_by_id()
    r2 = delegate.resource_get_by_id(page.id)
    assert r2.id == page.id

    # test resource_is_present()
    present = delegate.resource_is_present(page.absolute_url, crawl.id)
    assert present

    pages = delegate.resource_get_all()
    assert len(pages) > 0

    no_pages = delegate.resource_count_visited(crawl.id)
    assert no_pages == 1, "No of pages is {}".format(no_pages)

    craw_resources = delegate.resource_get_all_by_crawl(crawl.id)
    assert len(craw_resources) > 0

    r1 = delegate.resource_get_by_absolute_url_and_crawl_id(
        page.absolute_url, crawl.id)
    assert r1.id == page.id

    # # Test cascade delete
    delegate.crawl_delete_all()
    pages = delegate.resource_get_all()
    assert len(pages) == 0, "It should be {} but we found {}".format(
        0, len(pages))

    # # Clean up
    delegate.resource_delete_all()
    delegate.crawl_delete_all()
    delegate.site_delete_all()
    print("test_page done")