def crawl_spider(spider):

    ############### TRAIN SPIDER ##############
    if spider == "train":
        # initialize the scheduling queue
        q = Scheduler()

        # initialize all of the pipelines
        pipeline = []
        for pipe in settings.PIPELINES:
            try:
                pipeline.append(getattr(pipelines, pipe)())
            except Exception:
                print "Error: Unable to initialize %s pipe" % pipe
                quit()

        # initialize the spider
        # try:
        #     s = getattr(spiders, spider)()
        # except Exception:
        #     print "Error: It's likely that the input spider does not exist in spiders.py"
        #     quit()
        s = spiders.Train()
        #print s.__doc__

        # add all of the start links and known links to the top level of the queue
        for url in list(s.start_urls) + list(s.known_urls):
            q.add_link(url, 0)
        q.print_queue()

        # request urls while the scheduler is not empty and pass each response to the spider;
        # add returned links to the queue and send returned items down the pipeline
        visits = 0
        while not q.is_empty():
            wait_between_requests()  # wait a random small amount of time so we're less detectable
            url, level = q.get_next_link(what_level=True)
            print "Visit #%i, Q level %i, Q volume %i" % (visits, level, q.queue_volume())
            response = get_request(url)
            if response:
                # parse() returns the scraped items and the newly extracted links
                items, extracted_links = s.parse(response, level=level)
                add_to_queue(q, extracted_links)        # manage the returned links
                send_down_pipeline(pipeline, items, s)  # manage the returned items
            if settings.ASK_BETWEEN_REQUESTS:
                raw_input("Press ENTER to continue...")
            visits += 1

        if q.is_empty():
            print "CRAWL IS FINISHED: Queue is empty"
        #if visits >= settings.MAX_CRAWLS: print "CRAWL IS FINISHED: Crawled max number of urls (%i total)" % visits

    ################ TEST SPIDER ##############
    elif spider == "test":
        print "Test case"
        q = PriorityQueue()
        queued_links = set()

        # initialize all of the pipelines
        pipeline = []
        for pipe in settings.PIPELINES:
            try:
                pipeline.append(getattr(pipelines, pipe)())
            except Exception:
                print "Error: Unable to initialize %s pipe" % pipe
                quit()

        # initialize the spider
        # try:
        #     s = spiders.Test()
        # except Exception:
        #     print "Error: It's likely that the input spider does not exist in spiders.py"
        #     quit()
        s = spiders.Test()
        #print s.__doc__

        # seed the queue with the first start url; PriorityQueue pops the smallest
        # value first, so levels are stored negated to pop higher levels first
        q.put((-.1, s.start_urls[0]))
        queued_links.add(s.start_urls[0])

        # request urls while the queue is not empty and pass each response to the spider;
        # add returned links to the queue and send returned items down the pipeline
        visits = 0
        while not q.empty():
            wait_between_requests()  # wait a random small amount of time so we're less detectable
            priority, url = q.get()
            print "Q get:", -priority, url
            print "Visit #%i, Q volume %i" % (visits, q.qsize())
            response = get_request(url)
            if response:
                # parse() returns the scraped items and the newly extracted (level, url) links
                items, extracted_links = s.parse(response, level=-priority)
                for link in extracted_links:
                    if link[1] not in queued_links:
                        q.put((-link[0], link[1]))
                        queued_links.add(link[1])
                    # else:
                    #     print "We already queued %s" % link[1]
                send_down_pipeline(pipeline, items, s)  # manage the returned items
            if settings.ASK_BETWEEN_REQUESTS:
                raw_input("Press ENTER to continue...")
            visits += 1

        if q.empty():
            print "CRAWL IS FINISHED: Queue is empty"
        #if visits >= settings.MAX_CRAWLS: print "CRAWL IS FINISHED: Crawled max number of urls (%i total)" % visits

    else:
        print "Error: Unknown spider %r" % spider
        quit()
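
# A minimal sketch of how crawl_spider() might be wired up as a command-line
# entry point. The argparse usage below is an illustrative assumption, not the
# module's actual entry point; only the "train" and "test" spiders handled
# above are accepted.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Run a spider by name")
    parser.add_argument("spider", choices=["train", "test"],
                        help="which spider to crawl")
    args = parser.parse_args()
    crawl_spider(args.spider)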