def bar_data(crawl_id, bar_no):
    """Return the pages for a bar (percentage interval) inside a crawl.

    crawl_id -- id of the crawl to inspect
    bar_no   -- bar index; converted to a [lower, upper) percent interval
                via no_2_interval()

    Returns a list of {'page': resource, 'no': incoming_link_count} dicts
    for every page whose share of incoming internal links falls inside the
    interval. The last interval (upper == 100) is closed on both ends so a
    page at exactly 100% is not lost.
    """
    delegate = Delegate()

    # Find the total number of internal full links = T.
    # Guard against an empty crawl so the percentage below cannot divide
    # by zero (the original crashed with ZeroDivisionError).
    no_total = delegate.url_count_internal_full(crawl_id)
    if no_total == 0:
        return []

    (lower, upper) = no_2_interval(bar_no)

    selected_pages = []
    for page in delegate.resource_get_all_by_crawl(crawl_id):
        no = delegate.url_count_incoming_for_resource(page.id)
        percent = no * 100 / no_total
        # Half-open interval [lower, upper), except the final bar which is
        # closed so a 100% page is included.
        if upper == 100:
            in_bar = lower <= percent <= upper
        else:
            in_bar = lower <= percent < upper
        if in_bar:
            selected_pages.append({'page': page, 'no': no})
    return selected_pages
def site_add():
    """Create a new site from the submitted form and select it for the user.

    Form fields: 'page' (originating page, 'home' or other), 'site'
    (site name) and 'url'. Redirects back to the originating page.
    """
    delegate = Delegate()
    user = delegate.user_get_by_id(session['user_id'])

    # All values come from the POSTed form. (The original also read 'page'
    # from the query string and immediately overwrote it — dead code.)
    page = request.form['page']
    site_name = request.form['site']
    site_url = request.form['url']

    site = Site(name=site_name, url=site_url)
    delegate.site_create(site)

    # Remember the newly created site as the user's current one and
    # persist the *user*. BUG FIX: the original called site_update(site)
    # here, so current_site_id was never saved.
    user.current_site_id = site.id
    delegate.user_update(user)

    if page == 'home':
        return redirect(url_for('home'))
    return redirect(url_for('sites'))
def __init__(self, delegate, initialLink=None, max_links=0, no_workers=10, id=None):
    """Create a crawler.

    delegate    -- persistence facade used for all DB operations
    initialLink -- seed URL; queued immediately when given
    max_links   -- stop after this many pages (0 = unlimited)
    no_workers  -- number of worker threads
    id          -- crawl id; a fresh uuid4 string when omitted
    """
    Thread.__init__(self)
    self.noOfWorkers = no_workers
    self.workers = []
    self.running = True
    self.paused = False
    self.condition = RLock()
    # BUG FIX: use the injected delegate. The original ignored the
    # parameter and instantiated Delegate() itself, defeating injection.
    self.delegate = delegate
    # Listeners that want to receive messages (ex: progress) from Crawler.
    self.listeners = []
    # BUG FIX: the old default id=str(uuid.uuid4()) was evaluated once at
    # function-definition time, so every crawler created without an
    # explicit id shared the same one. Generate a fresh id per instance.
    self.id = id if id is not None else str(uuid.uuid4())
    self.initialLink = initialLink
    if initialLink is not None:
        self.add_initial_url(initialLink)
    self.max_links = max_links
    try:
        # Fails (and is logged) when initialLink is None or malformed.
        self.domain_regex = re.compile(get_domain(initialLink))
    except Exception as ex:
        logging.error("Exception {}".format(ex))
import os
import argparse
import sys

from amalgam.delegatex import XDelegate
from amalgam import database
from amalgam.models.modelsx import User, Site, metadata

# Module-level delegate shared by the management commands.
delegate = XDelegate()


def drop_tables():
    """Drop every table known to the shared metadata."""
    # metadata.drop_all() works for any configured backend; the old
    # per-backend handling (removing the sqlite file by hand, a TODO for
    # truncating mysql tables) is superseded by this single call.
    metadata.drop_all(database.engine)


def create_tables():
    """Create all tables if needed."""
    metadata.create_all(database.engine)
def main():
    """Crawl a single domain from the command line.

    Parses CLI arguments (-d/--domain, -w/--workers, -m/--max-links,
    --delay), seeds the database, creates Site and Crawl records, runs a
    CrawlerDB to completion and logs visit statistics.
    """
    # Default domain is scheme-less: the scheme is prepended below.
    # (The old default 'http://abctimetracking.com' combined with the
    # unconditional 'http://' + domain produced 'http://http://...'.)
    domain = 'abctimetracking.com'
    max_links = 0

    # Empty / seed DB
    from manage_db import empty, mock
    mock()

    # Parse arguments. NOTE: --domain is no longer required=True — that
    # contradicted both its default value and the fallback branch below.
    parser = argparse.ArgumentParser(description="A simple website crawler.")
    parser.add_argument('-d', '--domain', type=str, default=domain,
                        help='Domain to crawl')
    parser.add_argument('-w', '--workers', type=int, default=10,
                        help='Number of workers')
    parser.add_argument('-m', '--max-links', type=int, default=0,
                        help='Maximum no. of links to index')
    parser.add_argument('--delay', type=int, default=0,
                        help='Delay between requests')  # TODO: currently unused
    args = parser.parse_args()

    if args.domain:
        domain = args.domain
    else:
        print('No domain passed, using %s.' % domain)
        print('Read usage details in file header for more information on passing arguments.')

    if args.max_links:
        max_links = args.max_links

    # Avoid a double scheme when the user already passed one.
    if domain.startswith(('http://', 'https://')):
        theURL = domain
    else:
        theURL = 'http://' + domain
    noOfWorkers = args.workers

    delegate = Delegate()

    site = Site(name=domain, url=theURL)
    delegate.site_create(site)

    crawl = Crawl(site_id=site.id)
    delegate.crawl_create(crawl)

    crawler = CrawlerDB(initialLink=theURL, max_links=max_links,
                        no_workers=noOfWorkers, delegate=delegate,
                        id=crawl.id)

    t1 = time.time()
    crawler.start()
    crawler.join()
    total_time = time.time() - t1

    logger.info("Total internal links visited: %d in: %ds" %
                (crawler.no_visited_urls(), total_time))
    logger.info("Total external links: %d" % crawler.no_external_urls())

    print("All done. In limbo")
    crawler = None
    time.sleep(10)
class CrawlerDB(Thread):
    """Multi-threaded crawler that persists its frontier and results via a delegate.

    A pool of worker threads repeatedly claims the next unvisited Url from
    the database, fetches it, stores the page as a Resource and queues the
    links discovered on it. The crawler thread itself only supervises the
    workers and decides when the crawl is finished.
    """

    def __init__(self, delegate, initialLink=None, max_links=0, no_workers=10, id=None):
        """Create a crawler.

        delegate    -- persistence facade used for all DB operations
        initialLink -- seed URL; queued immediately when given
        max_links   -- stop after this many pages (0 = unlimited)
        no_workers  -- number of worker threads
        id          -- crawl id; a fresh uuid4 string when omitted
        """
        Thread.__init__(self)
        self.noOfWorkers = no_workers
        self.workers = []
        self.running = True
        self.paused = False
        self.condition = RLock()
        # BUG FIX: use the injected delegate; the original ignored the
        # parameter and built Delegate() itself, defeating injection.
        self.delegate = delegate
        # Listeners that want to receive messages (ex: progress) from Crawler.
        self.listeners = []
        # BUG FIX: the old default id=str(uuid.uuid4()) was evaluated once
        # at class-definition time, so every crawler created without an
        # explicit id shared the same one.
        self.id = id if id is not None else str(uuid.uuid4())
        self.initialLink = initialLink
        if initialLink is not None:
            self.add_initial_url(initialLink)
        self.max_links = max_links
        try:
            self.domain_regex = re.compile(get_domain(initialLink))
        except Exception as ex:
            logging.error("Exception {}".format(ex))

    def add_initial_url(self, address):
        """Queue the seed URL as an unvisited internal Url."""
        logger.info("Add initial URL")
        with self.condition:
            url = Url(url=address, absolute_url=address,
                      type=Url.TYPE_INTERNAL, crawl_id=self.id,
                      job_status=Url.JOB_STATUS_NOT_VISITED)
            self.delegate.url_create(url)

    def no_unvisited_urls(self):
        with self.condition:
            return self.delegate.url_count_unvisited(self.id)

    def no_pending_urls(self):
        """Count URLs that are unvisited or still in progress."""
        with self.condition:
            return self.delegate.url_count_pending(self.id)

    def all_unvisited_urls(self):
        with self.condition:
            return self.delegate.url_get_all_unvisited(self.id)

    def no_visited_urls(self):
        with self.condition:
            return self.delegate.url_count_visited(self.id)

    def no_visited_resources(self):
        with self.condition:
            return self.delegate.resource_count_visited(self.id)

    def no_external_urls(self):
        with self.condition:
            return self.delegate.url_count_external(self.id)

    def next_unvisited_link_id(self):
        """Atomically claim the next unvisited Url; return -1 when none is left."""
        link_id = -1
        with self.condition:
            url = self.delegate.url_get_first_unvisited(self.id)
            if url is not None:
                url.job_status = Url.JOB_STATUS_IN_PROGRESS  # Set Url as in progress
                self.delegate.url_update(url)
                link_id = url.id
        return link_id

    def mark_url_as_visited(self, url_id):
        with self.condition:
            url = self.delegate.url_get_by_id(url_id)
            url.job_status = Url.JOB_STATUS_VISITED
            self.delegate.url_update(url)

    def _type_links(self, links):
        """Tag every link dict as 'internal' or 'external' to the seed domain."""
        for link in links:
            if is_internal(get_domain(self.initialLink), link['absolute']):
                link['type'] = 'internal'
            else:
                link['type'] = 'external'

    def _get_links(self, link_id):
        """Fetch the page behind a Url id; return (page dict, typed links)."""
        with self.condition:
            link = self.delegate.url_get_by_id(link_id)
            (page, links) = get_links(link.absolute_url)
            self._type_links(links)
            return page, links

    def link2url(self, link):
        """Convert a scraped link dict into an (unsaved) Url row."""
        url = Url(crawl_id=self.id)
        if 'href' in link:
            url.url = link['href']
        if 'absolute' in link:
            url.absolute_url = link['absolute']
        if 'type' in link:
            url.type = link['type']
        if 'content' in link:
            url.raw_content = str(link['content'])
            # TODO: parse raw_content and keep only the text, without HTML
            # tags or other markup.
            url.text = str(link['content'])
        return url

    def page2resource(self, page):
        """Convert a fetched page dict into an (unsaved) Resource row."""
        resource = Resource(crawl_id=self.id)
        if 'url' in page:
            resource.absolute_url = page['url']
        if 'content' in page:
            resource.content = page['content']
        if 'elapsed' in page:
            resource.elapsed = page['elapsed']
        return resource

    def add_links(self, links, src_resource_id=None, status_code=200):
        """Add a bunch of URLs using the resource id as source (page where found it)."""
        with self.condition:
            for link in links:
                url = self.link2url(link)
                if src_resource_id is not None:
                    url.src_resource_id = src_resource_id
                    # If the destination page was already crawled, wire the
                    # link straight to it and mark it visited.
                    try:
                        src_resource = self.delegate.resource_get_by_id(
                            src_resource_id)
                        dest_resource = self.delegate.resource_get_by_absolute_url_and_crawl_id(
                            url.absolute_url, src_resource.crawl_id)
                        if dest_resource is not None:
                            url.job_status = Url.JOB_STATUS_VISITED
                            url.dst_resource_id = dest_resource.id
                            url.status_code = status_code
                    except Exception as e:
                        logger.warning("Exception {}".format(e))
                self.delegate.url_create(url)

    def add_resource(self, page):
        """Persist a page as a Resource unless it is already stored."""
        with self.condition:
            # BUG FIX: the original called resource_is_present(crawlId=self.id)
            # without the page URL; the delegate's signature (see tests) is
            # (absolute_url, crawl_id) — confirm keyword names against the
            # delegate implementation.
            if not self.delegate.resource_is_present(page['url'], self.id):
                resource = self.page2resource(page)
                self.delegate.resource_create(resource)

    def connect_url_to_destination(self, url_id, resource_id):
        """Point an existing Url at the Resource it resolves to."""
        with self.condition:
            url = self.delegate.url_get_by_id(url_id)
            url.dst_resource_id = resource_id
            self.delegate.url_update(url)

    def resource_get_by_absolute_url_and_crawl_id(self, address, crawler_id):
        with self.condition:
            return self.delegate.resource_get_by_absolute_url_and_crawl_id(
                address, crawler_id)

    def resource_create(self, page):
        """Create and return a Resource for a page; None when creation fails."""
        # BUG FIX: 'resource' was unbound (NameError on return) when
        # page2resource()/resource_create() raised.
        resource = None
        with self.condition:
            try:
                resource = self.page2resource(page)
                self.delegate.resource_create(resource)
            except Exception as e:
                # logger.warn is deprecated; the old message also contained
                # a stray '}' from a doubled brace in the format string.
                logger.warning("{} Exception {}.".format(
                    currentThread().getName(), e))
        return resource

    def run(self):
        # Initialize workers
        for i in range(self.noOfWorkers):
            self.workers.append(
                Thread(target=self.workerJob,
                       kwargs={"crawlId": self.id},
                       name="Thread-{}".format(i)))

        # Start workers
        self._start_all_workers()

        while self.running:
            logger.debug("[%s] Crawler thread cycle started."
                         % (currentThread().getName()))
            if self.paused:
                logger.debug("[%s] Crawler paused."
                             % (currentThread().getName()))
                time.sleep(1)  # BUG FIX: avoid a busy spin while paused
                continue
            logger.debug("[%s] Crawler check if jobs are done."
                         % (currentThread().getName()))
            if self._are_jobs_done():
                logger.debug("Crawler is shutting down")
                self.setRunning(False)
                break
            else:
                logger.debug("[%s] Crawler's jobs are NOT done."
                             % (currentThread().getName()))
            logger.debug("[%s] Crawler sleep." % (currentThread().getName()))
            time.sleep(1)

        # Join them
        self._join_all_workers()

        msg = {
            "status": "done",
            "visited": self.no_visited_urls(),
            "to_visit": self.no_unvisited_urls(),
            "max_links": self.max_links,
            "crawlId": self.id
        }
        self.notify(msg)

    def workerJob(self, crawlId):
        """Worker loop: claim a Url, fetch it, persist results, repeat."""
        while self.running:
            logger.debug("[%s] Worker thread cycle started."
                         % (currentThread().getName()))
            if self.paused:
                time.sleep(0.1)  # BUG FIX: avoid a busy spin while paused
                continue

            # If max pages specified see if we already reached it
            if self.max_links > 0:
                if self.no_visited_resources() >= self.max_links:
                    # Wait for the supervising thread to shut us down.
                    time.sleep(0.1)
                    continue

            # Grab next job
            link_id = self.next_unvisited_link_id()
            logger.debug("[%s] Next link [%d]."
                         % (currentThread().getName(), link_id))

            # (The old "'link_id' in locals()" guard was always true.)
            if link_id != -1:
                logger.debug("[%s] Current link : %d"
                             % (currentThread().getName(), link_id))
                page, links = self._get_links(link_id)
                logger.debug("[%s] Discovered [%d] links."
                             % (currentThread().getName(), len(links)))
                try:
                    with self.condition:
                        # Update link's status code
                        url = self.delegate.url_get_by_id(link_id)
                        url.status_code = page['status-code']
                        self.delegate.url_update(url)

                        if page['status-code'] == 200:
                            # 1.Add Resource 2.Link URLs to (new | existing) Resources
                            resource = self.resource_get_by_absolute_url_and_crawl_id(
                                page['url'], self.id)
                            if resource is None:
                                # Add it only if max links not reached
                                maximum_reached = False
                                if self.max_links > 0 and \
                                        self.no_visited_resources() >= self.max_links:
                                    maximum_reached = True
                                if not maximum_reached:
                                    resource = self.resource_create(page)
                                    self.connect_url_to_destination(
                                        link_id, resource.id)
                                    logger.debug(
                                        "[%s] Adding links to DB linked to resource [%d]"
                                        % (currentThread().getName(), resource.id))
                                    self.add_links(links, resource.id,
                                                   page['status-code'])
                            else:
                                # Resource already added, only make the end connection
                                self.connect_url_to_destination(
                                    link_id, resource.id)

                        self.mark_url_as_visited(link_id)
                        msg = {
                            "status": "in_progress",
                            "visited": self.no_visited_urls(),
                            "to_visit": self.no_unvisited_urls(),
                            "max_links": self.max_links,
                            "crawlId": crawlId,
                            "currentWorker": currentThread().getName()
                        }
                        self.notify(msg)
                except Exception as e:
                    # Log with stack trace instead of the old bare print().
                    logger.exception("Error {}".format(e))
                logger.debug("[%s] cycle ended."
                             % (currentThread().getName()))
            else:
                logger.debug("[%s] is shutting down."
                             % (currentThread().getName()))

    def stop(self):
        self.setRunning(False)

    def pause(self):
        self.paused = True

    def resume(self):
        if self.paused:
            self.paused = False

    def _start_all_workers(self):
        for w in self.workers:
            w.start()

    def _are_jobs_done(self):
        """True when nothing is pending or the max-pages cap was reached."""
        # FIXME: if a worker grabs the initial link while we are here,
        # no_pending_urls() can read zero and the crawler may initiate a
        # premature shutdown.
        no_pending_urls = self.no_pending_urls()
        logger.debug("Crawler: _are_jobs_done(...) : no_pending_urls = %d "
                     % (no_pending_urls, ))
        if no_pending_urls == 0:
            return True

        # Test if we have reached the max no of pages
        if self.max_links > 0:
            if self.no_visited_resources() >= self.max_links:
                return True

        return False

    def _join_all_workers(self):
        for w in self.workers:
            w.join()

    def setRunning(self, status):
        self.running = status

    def addListener(self, callback):
        self.listeners.append(callback)

    def removeListener(self, callback):
        self.listeners.remove(callback)

    def notify(self, msg):
        for callback in self.listeners:
            callback(msg)
def inner_links_data(crawl_id):
    """Build chart.js data for the distribution of inner links of a crawl.

    For every page of the crawl the percentage Pi = Li * 100 / T is
    computed, where Li is the number of internal full links pointing to
    the page and T the total number of internal full links. Pages are
    bucketed into percent intervals of STEP width and the per-bucket
    percentage totals are returned as
    {'labels': [...], 'datasets': [{'label': ..., 'data': [...]}]}.
    (The original placed this description after the first statement, so
    it was never a real docstring.)
    """
    delegate = Delegate()

    # Intervals [0-STEP], [STEP-2*STEP], ..., ending at 100.
    intervals = [[i, i + STEP] for i in range(0, 100, STEP)]
    print("Intervals %r " % intervals)

    # Select all pages
    pages = delegate.resource_get_all_by_crawl(crawl_id)

    # For every page, the no of internal full urls pointing to it (Li).
    d = dict()
    check = 0
    for page in pages:
        no = delegate.url_count_incoming_for_resource(page.id)
        d[page.id] = no
        check = check + no
    for k, v in d.items():
        print("\n%d -> %d" % (k, v))

    # The total number of internal full links (T).
    no_total = delegate.url_count_internal_full(crawl_id)
    print("Total full internal links: %d " % no_total)

    labels = ["{}-{}%".format(lo, hi) for lo, hi in intervals]

    # BUG FIX: an empty crawl (T == 0) made the percentage computation
    # below divide by zero; return an all-zero chart instead.
    if no_total == 0:
        return {
            'labels': labels,
            'datasets': [{'label': 'Inner links', 'data': [0] * len(labels)}]
        }

    assert check == no_total, "The no of total internal links do not match"

    # Pi = Li * 100 / T
    percents = {page.id: d[page.id] * 100 / no_total for page in pages}
    print("\nPercentages")
    for k, v in percents.items():
        print("\n%d -> %.2f%%" % (k, v))

    # Sum the percentages of the pages falling into each interval. The
    # last interval is closed ([90-100]) so 100% pages are not dropped.
    partials = dict()
    for lo, hi in intervals:
        key = "{}-{}%".format(lo, hi)
        partials[key] = 0
        for page in pages:
            p = percents[page.id]
            upper_ok = (p <= hi) if hi == 100 else (p < hi)
            if lo <= p and upper_ok:
                partials[key] = partials[key] + p

    print("\nPartials")
    for k, v in partials.items():
        print("\n{} {} ".format(k, v))

    # chart.js pie-chart shape: labels plus one dataset of values.
    new_data = {
        'labels': list(partials.keys()),
        'datasets': [{
            'label': 'Inner links',
            'data': list(partials.values())
        }]
    }
    return new_data
def sites():
    """Render the sites listing page for the logged-in user."""
    delegate = Delegate()
    current_user = delegate.user_get_by_id(session['user_id'])
    # TODO: In the future show only sites for current user
    all_sites = delegate.site_get_all()
    return render_template('sites.html', user=current_user, sites=all_sites)
def report_inner_incomming_urls():
    """Return, JSON-encoded, all pages linking into ?resource_id."""
    resource_id = request.args.get('resource_id', type=int)
    delegate = Delegate()
    incoming_pages = delegate.resource_get_all_incoming_for_resource(resource_id)
    return jsonpickle.encode(incoming_pages)
def test_crawl(self):
    """Exercise crawl_create / crawl_get_* including last-crawl selection."""
    delegate = XDelegate()
    print("test_crawl started")

    # Site 1
    site1 = Site()
    site1.name = "Site1"
    site1.url = 'http://foo.com'
    delegate.site_create(site1)

    # Crawl
    crawl = Crawl(site_id=site1.id)
    delegate.crawl_create(crawl)
    assert crawl.id > 0

    # A second crawl dated 2 minutes after the first (subtracting a
    # negative delta), so it must be reported as the last crawl.
    delta = datetime.timedelta(minutes=-2)
    t2 = crawl.date - delta
    crawl2 = Crawl(site_id=site1.id, date=t2)
    delegate.crawl_create(crawl2)
    assert crawl2.id > 0

    sites = delegate.site_get_all()
    print("No of site: {}".format(len(sites)))
    assert len(sites) == 1

    crawls = delegate.crawl_get_all()
    assert len(crawls) == 2

    crawls2 = delegate.crawl_get_all_for_site(site1.id)
    # BUG FIX: the original re-asserted len(crawls) here, leaving
    # crawl_get_all_for_site()'s result unverified.
    assert len(crawls2) == 2

    last_crawl = delegate.crawl_get_last_for_site(site1.id)
    assert last_crawl.id == crawl2.id, "Last crawl id was {} when it should be {}".format(
        last_crawl.id, crawl2.id)

    # Clean up
    # delegate.crawl_delete_all()
    delegate.site_delete_all()
    print("test_crawl done")
def test_user(self):
    """CRUD round-trip for User through the delegate."""
    delegate = XDelegate()

    # Create
    created = User()
    created.email = "*****@*****.**"
    created.password = "******"
    created.name = "One"
    delegate.user_create(created)
    assert created.id > 0

    # Read back by credentials
    fetched = delegate.user_get_by_email_and_password(created.email,
                                                      created.password)
    assert created.email == fetched.email
    assert created.password == fetched.password
    assert created.id == fetched.id, "U1's id:{} U2's id:{} ".format(
        created.id, fetched.id)

    # Update
    fetched.name = fetched.name + 'x'
    r = delegate.user_update(fetched)
    # assert r
    refetched = delegate.user_get_by_id(fetched.id)
    assert fetched.id == refetched.id
    assert fetched.name == refetched.name

    users1 = delegate.user_get_all()
    assert len(users1) == 1

    # Delete one
    r = delegate.user_delete_by_id(created.id)
    assert r
    users1 = delegate.user_get_all()
    assert len(users1) == 0

    # Delete all
    delegate.user_delete_all()
    users2 = delegate.user_get_all()
    assert len(users2) == 0
def test_link(self):
    """Exercise the Url CRUD/query API of the delegate end to end.

    Builds one site/crawl/resource, then checks url_is_present, the
    counting queries (unvisited/visited/pending/internal/external),
    first-unvisited selection, deletion, and the cascade delete from
    Resource down to Url.
    """
    delegate = XDelegate()
    print("test_page started")

    # Site 1
    site1 = Site()
    site1.name = "Site1"
    site1.url = 'http://foo.com'
    delegate.site_create(site1)

    # Crawl
    crawl = Crawl(site_id=site1.id)
    delegate.crawl_create(crawl)
    assert crawl.id > 0

    # Page
    page = Resource()
    page.crawl_id = crawl.id
    page.content = "Ala bala portocala"
    page.absolute_url = "https://scriptoid.com/index.php"
    delegate.resource_create(page)

    # Link
    # Test url_is_present() — no Url rows exist yet, even though a
    # Resource with this absolute_url does.
    p1 = delegate.url_is_present('https://scriptoid.com/index.php', crawl.id)
    assert not p1

    # Test url_count_unvisited()
    n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
    assert n1 == 0, 'n1 is {}'.format(n1)

    # Test url_get_all_by_crawl_id()
    crawl_urls = delegate.url_get_all_by_crawl_id(crawl.id)
    assert len(crawl_urls) == 0

    # Test url_count_incoming_for_resource()
    uc1 = delegate.url_count_incoming_for_resource(page.id)
    assert uc1 == 0

    # Test url_count_internal_full()
    cif = delegate.url_count_internal_full(crawl.id)
    assert cif == 0

    # url1: internal link, pre-marked IN_PROGRESS, source only (no dst).
    url1 = Url()
    url1.src_resource_id = page.id
    url1.url = '/team'
    url1.absolute_url = 'https://scriptoid.com/team'
    url1.type = Url.TYPE_INTERNAL
    url1.crawl_id = crawl.id
    url1.job_status = Url.JOB_STATUS_IN_PROGRESS
    lid1 = delegate.url_create(url1)
    assert url1.id > 0
    assert lid1 == url1.id  # url_create returns the new row id

    # url2: internal "full" link (has both src and dst resources),
    # pointing back at the page itself.
    url2 = Url()
    url2.src_resource_id = page.id
    url2.dst_resource_id = page.id
    url2.url = '/contact'
    url2.absolute_url = 'https://scriptoid.com/index.php'
    url2.type = Url.TYPE_INTERNAL
    url2.crawl_id = crawl.id
    delegate.url_create(url2)
    assert url2.id > 0

    # url3: internal link with destination only (no src resource), so it
    # does not count as "incoming" for the page below.
    url3 = Url()
    url3.dst_resource_id = page.id
    url3.url = '/jobs'
    url3.absolute_url = 'https://scriptoid.com/jobs.php'
    url3.type = Url.TYPE_INTERNAL
    url3.crawl_id = crawl.id
    delegate.url_create(url3)
    assert url3.id > 0

    # Test url_count_incoming_for_resource() — only url2 qualifies.
    uc1 = delegate.url_count_incoming_for_resource(page.id)
    assert uc1 == 1

    # Test url_get_by_id()
    u1 = delegate.url_get_by_id(url1.id)
    assert u1.id == url1.id

    # Test url_is_present() — url2 now carries this absolute_url.
    p1 = delegate.url_is_present('https://scriptoid.com/index.php', crawl.id)
    assert p1

    # Test url_get_all_by_crawl_id()
    crawl_urls = delegate.url_get_all_by_crawl_id(crawl.id)
    assert len(crawl_urls) == 3

    # Test first unvisited link — url1 is IN_PROGRESS, so url2 is first.
    l1 = delegate.url_get_first_unvisited(crawl_id=crawl.id)
    assert l1.id == url2.id, 'l1.id = {} and url.id = {}'.format(
        l1.id, url2.id)

    # Test url_get_all_unvisited()
    unvisited1 = delegate.url_get_all_unvisited(crawl.id)
    assert len(unvisited1) == 2

    # Test url_count_unvisited()
    n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
    assert n1 == 2, 'n1 is {}'.format(n1)

    n2 = delegate.url_count_visited(crawl_id=crawl.id)
    assert n2 == 0, 'Actually n2 is {}'.format(n2)

    # Mark url1 VISITED: visited count moves 0 -> 1, unvisited stays 2.
    url1.job_status = Url.JOB_STATUS_VISITED
    delegate.url_update(url1)
    l1 = delegate.url_get_first_unvisited(crawl_id=crawl.id)
    assert l1.id == url2.id
    n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
    assert n1 == 2, 'n1 is {}'.format(n1)
    n2 = delegate.url_count_visited(crawl_id=crawl.id)
    assert n2 == 1, 'n2 is {}'.format(n2)

    # Test url_count_internal_full() — still only url2 has src and dst.
    cif = delegate.url_count_internal_full(crawl.id)
    assert cif == 1

    # Test url_count_pending()
    ucp = delegate.url_count_pending(crawl.id)
    assert ucp == 2

    # Test url_delete_all()
    delegate.url_delete_all()
    links = delegate.url_get_all()
    assert len(links) == 0, "When actually there are {}".format(len(links))

    # Test url_count_external()
    uce = delegate.url_count_external(crawl.id)
    assert uce == 0
    url4 = Url()
    url4.dst_resource_id = page.id
    url4.url = '/jobs'
    url4.absolute_url = 'https://scriptoid.com/jobs.php'
    url4.type = Url.TYPE_EXTERNAL
    url4.crawl_id = crawl.id
    delegate.url_create(url4)
    assert url4.id > 0
    uce = delegate.url_count_external(crawl.id)
    assert uce == 1
    assert delegate.url_delete_by_id(url4.id)

    # Test a cascade delete from parent Page resource_delete_all() to Link
    url = Url()
    url.src_resource_id = page.id
    url.url = '/contact'
    url.absolute_url = 'https://scriptoid.com/index.php'
    url.type = Url.TYPE_INTERNAL
    url.crawl_id = crawl.id
    delegate.url_create(url)
    assert url.id > 0
    delegate.resource_delete_all()
    links = delegate.url_get_all()
    assert len(links) == 0, "When actually there are {}".format(len(links))

    # Clean up
    # delegate.link_delete_all()
    delegate.resource_delete_all()
    delegate.crawl_delete_all()
    delegate.site_delete_all()
    print("test_page done")
def test_page(self):
    """Exercise the Resource CRUD/query API, incl. cascade delete from Crawl."""
    delegate = XDelegate()
    print("test_page started")

    # Site 1
    site1 = Site()
    site1.name = "Site1"
    site1.url = 'http://foo.com'
    delegate.site_create(site1)

    # Crawl
    crawl = Crawl(site_id=site1.id)
    delegate.crawl_create(crawl)
    assert crawl.id > 0

    no_pages = delegate.resource_count_visited(crawl.id)
    assert no_pages == 0, "No of pages is {}".format(no_pages)

    # Page
    craw_resources = delegate.resource_get_all_by_crawl(crawl.id)
    assert len(craw_resources) == 0

    # test resource_get_by_absolute_url_and_crawl_id()
    r1 = delegate.resource_get_by_absolute_url_and_crawl_id(
        "no such url :p", crawl.id)
    # Identity comparison — the original's '== None' was unidiomatic.
    assert r1 is None

    # test resource_is_present()
    present = delegate.resource_is_present('no such url :p', crawl.id)
    assert not present

    # ~1 MB of content to exercise large payloads.
    page = Resource()
    page.crawl_id = crawl.id
    page.content = "A long content " + "a" * 1024 * 1024
    page.absolute_url = "https://scriptoid.com/index.php"
    delegate.resource_create(page)
    assert page.id > 0

    # test resource_get_by_id()
    r2 = delegate.resource_get_by_id(page.id)
    assert r2.id == page.id

    # test resource_is_present()
    present = delegate.resource_is_present(page.absolute_url, crawl.id)
    assert present

    pages = delegate.resource_get_all()
    assert len(pages) > 0

    no_pages = delegate.resource_count_visited(crawl.id)
    assert no_pages == 1, "No of pages is {}".format(no_pages)

    craw_resources = delegate.resource_get_all_by_crawl(crawl.id)
    assert len(craw_resources) > 0

    r1 = delegate.resource_get_by_absolute_url_and_crawl_id(
        page.absolute_url, crawl.id)
    assert r1.id == page.id

    # Test cascade delete: removing the crawls must remove the resources.
    delegate.crawl_delete_all()
    pages = delegate.resource_get_all()
    assert len(pages) == 0, "It should be {} but we found {}".format(
        0, len(pages))

    # Clean up
    delegate.resource_delete_all()
    delegate.crawl_delete_all()
    delegate.site_delete_all()
    print("test_page done")