Example #1
0
def crawl(c = None, seed = []):
	if c == None:
		c = Crawler(
			seed = seed, # your seed urls here 
			default_crawl_delay = 20, 
			obey_robots_txt = True,
			document_fetchers = 15,
			robots_txt_fetchers = 5) #start at least this many celery workers
	
	try:
		# start crawling, with this tasks specific termination criteria and 
		# a save period of 20 seconds
		c.crawl(
			termination_checker = example_task_termination_checker,
			save_frequency = timedelta(seconds = 20))
		
	finally:
		
		# if we were killed or finished, suspend crawl state to file.
		# revive the crawl with resume from crawler.py to explore results
		print "\nSuspended crawl to " + c.suspend()
		
		# print some statistics
		print "Downloaded bytes: " + str(cstats.downloaded_bytes(c))
		print "Discovered links: " + str(cstats.discovered_links(c))
		print "Discovered domains: " + str(cstats.discovered_domains(c))
		print "Runtime: " + str(cstats.runtime(c)) + " seconds"
		maxref = cstats.most_prolific_referer(c)
		# utf-8 printing problem in domain?
		print "Most prolific referrer was " + maxref["name"] + " with an average of " + str(maxref["avg_links_per_page"]) + " outgoing links per page."+"\n"
Example #2
0
def crawl(c=None, seed=[]):
    if c == None:
        c = Crawler(
            seed=seed,  # your seed urls here 
            default_crawl_delay=20,
            obey_robots_txt=True,
            document_fetchers=15,
            robots_txt_fetchers=5)  #start at least this many celery workers

    try:
        # start crawling, with this tasks specific termination criteria and
        # a save period of 20 seconds
        c.crawl(termination_checker=example_task_termination_checker,
                save_frequency=timedelta(seconds=20))

    finally:

        # if we were killed or finished, suspend crawl state to file.
        # revive the crawl with resume from crawler.py to explore results
        print "\nSuspended crawl to " + c.suspend()

        # print some statistics
        print "Downloaded bytes: " + str(cstats.downloaded_bytes(c))
        print "Discovered links: " + str(cstats.discovered_links(c))
        print "Discovered domains: " + str(cstats.discovered_domains(c))
        print "Runtime: " + str(cstats.runtime(c)) + " seconds"
        maxref = cstats.most_prolific_referer(c)
        # utf-8 printing problem in domain?
        print "Most prolific referrer was " + maxref[
            "name"] + " with an average of " + str(
                maxref["avg_links_per_page"]
            ) + " outgoing links per page." + "\n"
Example #3
0
def example_task_termination_checker(crawler):
	""" Return True once this task's termination criteria are met:
		10000 links or 100 domains discovered.

		Note that it is only checked once per pass of the crawl
		management loop, so exceeding the termination criteria by
		some small number of items discovered is expected.
	"""
	# guard clause keeps the original short-circuit: the domain count
	# is only consulted when the link threshold has not been reached
	if cstats.discovered_links(crawler) >= 10000:
		return True
	return cstats.discovered_domains(crawler) >= 100
Example #4
0
def example_task_termination_checker(crawler):
    """ Return True once this task's termination criteria are met:
        10000 links or 100 domains discovered.

        Note that it is only checked once per pass of the crawl
        management loop, so exceeding the termination criteria by
        some small number of items discovered is expected.
    """
    # guard clause keeps the original short-circuit: the domain count
    # is only consulted when the link threshold has not been reached
    if cstats.discovered_links(crawler) >= 10000:
        return True
    return cstats.discovered_domains(crawler) >= 100