Example #1
0
def crawl(c = None, seed = []):
	if c == None:
		c = Crawler(
			seed = seed, # your seed urls here 
			default_crawl_delay = 20, 
			obey_robots_txt = True,
			document_fetchers = 15,
			robots_txt_fetchers = 5) #start at least this many celery workers
	
	try:
		# start crawling, with this tasks specific termination criteria and 
		# a save period of 20 seconds
		c.crawl(
			termination_checker = example_task_termination_checker,
			save_frequency = timedelta(seconds = 20))
		
	finally:
		
		# if we were killed or finished, suspend crawl state to file.
		# revive the crawl with resume from crawler.py to explore results
		print "\nSuspended crawl to " + c.suspend()
		
		# print some statistics
		print "Downloaded bytes: " + str(cstats.downloaded_bytes(c))
		print "Discovered links: " + str(cstats.discovered_links(c))
		print "Discovered domains: " + str(cstats.discovered_domains(c))
		print "Runtime: " + str(cstats.runtime(c)) + " seconds"
		maxref = cstats.most_prolific_referer(c)
		# utf-8 printing problem in domain?
		print "Most prolific referrer was " + maxref["name"] + " with an average of " + str(maxref["avg_links_per_page"]) + " outgoing links per page."+"\n"
Example #2
0
def crawl(c=None, seed=[]):
    if c == None:
        c = Crawler(
            seed=seed,  # your seed urls here 
            default_crawl_delay=20,
            obey_robots_txt=True,
            document_fetchers=15,
            robots_txt_fetchers=5)  #start at least this many celery workers

    try:
        # start crawling, with this tasks specific termination criteria and
        # a save period of 20 seconds
        c.crawl(termination_checker=example_task_termination_checker,
                save_frequency=timedelta(seconds=20))

    finally:

        # if we were killed or finished, suspend crawl state to file.
        # revive the crawl with resume from crawler.py to explore results
        print "\nSuspended crawl to " + c.suspend()

        # print some statistics
        print "Downloaded bytes: " + str(cstats.downloaded_bytes(c))
        print "Discovered links: " + str(cstats.discovered_links(c))
        print "Discovered domains: " + str(cstats.discovered_domains(c))
        print "Runtime: " + str(cstats.runtime(c)) + " seconds"
        maxref = cstats.most_prolific_referer(c)
        # utf-8 printing problem in domain?
        print "Most prolific referrer was " + maxref[
            "name"] + " with an average of " + str(
                maxref["avg_links_per_page"]
            ) + " outgoing links per page." + "\n"
Example #3
0
def example_task_termination_checker(crawler):
	""" Return True once this task's termination criteria are met:
		10000 links or 100 domains discovered.

		Note that it is only checked once per pass of the crawl
		management loop, so exceeding the termination criteria by
		some small number of items discovered is expected.
	"""
	# guard clause keeps the original short-circuit: the domain count
	# is only consulted when the link threshold has not been reached
	if cstats.discovered_links(crawler) >= 10000:
		return True
	return cstats.discovered_domains(crawler) >= 100
Example #4
0
def example_task_termination_checker(crawler):
    """ Return True once this task's termination criteria are met:
        10000 links or 100 domains discovered.

        Note that it is only checked once per pass of the crawl
        management loop, so exceeding the termination criteria by
        some small number of items discovered is expected.
    """
    # guard clause keeps the original short-circuit: the domain count
    # is only consulted when the link threshold has not been reached
    if cstats.discovered_links(crawler) >= 10000:
        return True
    return cstats.discovered_domains(crawler) >= 100