def query_link_array(self, crawl_links):
    '''Scan a flattened crawl_link array and return the hrefs of every
    entry that satisfies the active query criteria. Criteria are ANDed:
    an entry must match the queried resource type (singular, or a plural
    form when the link came from an item list) and the queried title,
    where each criterion is only applied if it is not None. Entries
    already discovered are not filtered here; the caller's set handles
    de-duplication.'''
    if self.qry_resource_type is not None:
        log.info('SEARCH_LIST: looking for singular: %s', self.qry_resource_type)
        log.info('SEARCH_LIST: looking for plural as item_list: %s', self.qry_resource_plural)
    if self.qry_resource_title is not None:
        log.info('SEARCH_LIST: looking for title: %s', self.qry_resource_title)

    matches = []

    for candidate in crawl_links:
        log.debug('SEARCH_LIST: checking if %s matches query criteria', candidate['href'])
        accepted = True

        # type criterion: plural form found inside an item list, or an
        # exact singular match (note: plural check is a substring test
        # against each plural form, as in the original)
        if self.qry_resource_type is not None:
            kind = candidate['type'].lower()
            plural_hit = candidate['from_item_list'] and \
                any(kind in form for form in self.qry_resource_plural)
            if plural_hit or kind == self.qry_resource_type:
                log.info('SEARCH_LIST: matched search_type %s', candidate['type'])
            else:
                accepted = False

        # title criterion: case-insensitive exact match
        if self.qry_resource_title is not None:
            if candidate['title'].lower() == self.qry_resource_title:
                log.info('SEARCH_LIST: matched search_title %s', candidate['title'])
            else:
                accepted = False

        if accepted:
            matches.append(candidate['href'])

    return matches
def __init__(self, entry_point='http://learnair.media.mit.edu:8000/',
             crawl_delay=1000, filter_keywords=('previous', 'next')):
    '''Initialize crawler state.

    entry_point     -- starting URL for the crawl
    crawl_delay     -- how long, in ms, before accessing/crawling a new resource
    filter_keywords -- extra link keywords to ignore while crawling, appended
                       to the built-in filter list; any iterable of strings.
                       FIX: default is now a tuple instead of a mutable list
                       (mutable-default pitfall); behavior is unchanged since
                       the argument is only iterated.
    '''
    self.entry_point = entry_point  #entry point URI

    #initialize crawl variables
    self.current_uri = entry_point  #keep track of current location
    self.current_uri_type = 'entry_point'
    self.crawl_delay = crawl_delay  #in milliseconds
    self.degrees = 0
    self.return_if_found = False
    self.createform_type = None
    #persistence of 0 -- semantics defined by TimeDecaySet (project class);
    #presumably entries never decay or decay immediately -- TODO confirm
    self.found_resources = TimeDecaySet(0)

    #initialize filter word list for crawling
    self.filter_keywords = ['edit', 'create', 'self', 'curies', 'websocket']
    #FIX: was a list comprehension used only for its append side effects;
    #extend() is the idiomatic form
    self.filter_keywords.extend(filter_keywords)

    log.debug("filter keywords %s", self.filter_keywords)
    log.info("-----------------------------------------------")
    log.info("Crawler Initialized.")
    log.info("Entry Point: %s", self.entry_point)
    log.info("-----------------------------------------------")
def __init__(self, mask_length=8):
    '''initializes fixed size hash table (2^mask_length entries), preallocates
    using C for speed and size. Each stored value in the table is a cityHash64
    value (64 bits), so the hash table can support (theoretically) up to 2^64
    entries. Defaults to 2^8 entries (256 entries). Instead of storing a
    linked list at each hash table index, only the most recent hash value is
    stored; collisions may cause re-crawls, which is acceptable for loop
    avoidance. Values are stored based on a bitmask over the 64 bit hash.
    ex: 'http://test.com' hashes to '0x1234567887654321', and the cache table
    size is 2^8, or 256, so we apply an 8 bit mask of 0xff (& 255) to the
    hash. This gives us hashtable[0x21] = 0x1234567887654321.

    Raises TypeError when the platform's array itemsize cannot hold 64 bits.
    '''
    log.info("-----------------------------------------------")
    log.info("---- Setting up cache ----")
    self._cache_table_mask_length = mask_length
    self._cache_mask = (2 ** self._cache_table_mask_length) - 1
    #preallocate the table; 'L' stores C unsigned longs (itemsize checked below)
    self._cache = array.array('L', (0 for i in range(self._cache_mask + 1)))
    if self._cache.itemsize < 8:
        #on platforms where C unsigned long is 4 bytes we cannot hold the hash
        log.error("Cache Item Size is too small to represent 64 bit CityHash Value")
        raise TypeError("Cache Item Size is too small to represent 64 bit CityHash Value")
    #FIX: original mixed str.format() ('{0:b}') with logging's lazy %-style
    #arguments in a single call, which is fragile and confusing; this single
    #%-style call renders the identical message text
    log.info('cache length = %s, size = %s kB, mask = b%s',
             len(self._cache), sys.getsizeof(self._cache) / 1000.0,
             format(self._cache_mask, 'b'))
    log.info("-----------------------------------------------")
def push_uris_to_queue(self, uris):
    '''Record each URI in the found_resources set and report whether at
    least one of them was new (i.e. not already held by the set).'''
    banner = '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
    new_seen = False

    for uri in uris:
        # TimeDecaySet.add returns True only when the URI was not
        # already present
        if not self.found_resources.add(uri):
            continue
        log.info(banner)
        log.info(banner)
        log.info('New Resource Found! %s', uri)
        log.info(banner)
        log.info(banner)
        new_seen = True

    return new_seen
def __init__(self, entry_point='http://learnair.media.mit.edu:8000/',
             cache_table_mask_length=8, track_search_depth=5,
             found_set_persistence=720, crawl_delay=1000,
             filter_keywords=('previous', 'next')):
    '''Initialize crawler state, cache, and queue hooks.

    entry_point             -- starting URL for the crawl
    cache_table_mask_length -- cache size exponent (2^n slots)
    track_search_depth      -- how many steps in the crawl path we save to
                               retrace when at a dead end
    found_set_persistence   -- how long to keep a resource URI in memory
                               before it may be reported as new again
                               (original comments disagreed: header said
                               minutes, construction said seconds -- TODO
                               confirm the unit TimeDecaySet expects)
    crawl_delay             -- how long, in ms, before crawling a new resource
    filter_keywords         -- extra link keywords to filter while crawling.
                               FIX: default is now a tuple instead of a
                               mutable list (mutable-default pitfall);
                               behavior unchanged since it is only iterated.
    '''
    self.entry_point = entry_point  #entry point URI

    #initialize crawl variables
    self.current_uri = entry_point  #keep track of current location
    self.current_uri_type = 'entry_point'
    self.current_uri_title = 'entry_point'
    self.crawl_history = LeakyLIFO(track_search_depth)  #keep track of past
    self.crawl_delay = crawl_delay  #in milliseconds
    self.found_resources = TimeDecaySet(found_set_persistence)

    #initialize cache
    self.cache = CrawlerCacheWithCollisionHistory(cache_table_mask_length)

    #initialize queue/zmq variables
    self.q = None
    self.zmq = None
    self.find_called = False

    #initialize filter word list for crawling
    self.filter_keywords = ['edit', 'create', 'self', 'curies', 'websocket']
    #FIX: was a list comprehension used only for its append side effects;
    #extend() is the idiomatic form
    self.filter_keywords.extend(filter_keywords)

    log.debug("filter keywords %s", self.filter_keywords)
    log.info("-----------------------------------------------")
    log.info("Crawler Initialized.")
    log.info("Entry Point: %s", self.entry_point)
    log.info("-----------------------------------------------")
def __init__(self, mask_length=8):
    '''initializes fixed size hash table (2^mask_length entries), preallocated
    via the array module for speed and size. Each slot holds one cityHash64
    value (64 bits); only the most recent hash is kept per slot (no chaining),
    so collisions may cause re-crawls -- acceptable for avoiding local crawl
    loops. A value's slot is chosen by masking the low mask_length bits of its
    64-bit hash. ex: 'http://test.com' hashes to '0x1234567887654321' and a
    2^8 table applies mask 0xff, giving hashtable[0x21] = 0x1234567887654321.

    Raises TypeError when the platform's array itemsize cannot hold 64 bits.
    '''
    log.info("-----------------------------------------------")
    log.info("---- Setting up cache ----")
    self._cache_table_mask_length = mask_length
    self._cache_mask = (2 ** self._cache_table_mask_length) - 1
    #preallocate the table; 'L' stores C unsigned longs (itemsize checked below)
    self._cache = array.array('L', (0 for i in range(self._cache_mask + 1)))
    if self._cache.itemsize < 8:
        #on platforms where C unsigned long is 4 bytes we cannot hold the hash
        log.error(
            "Cache Item Size is too small to represent 64 bit CityHash Value"
        )
        raise TypeError(
            "Cache Item Size is too small to represent 64 bit CityHash Value"
        )
    #FIX: original mixed str.format() ('{0:b}') with logging's lazy %-style
    #arguments in a single call, which is fragile and confusing; this single
    #%-style call renders the identical message text
    log.info('cache length = %s, size = %s kB, mask = b%s',
             len(self._cache), sys.getsizeof(self._cache) / 1000.0,
             format(self._cache_mask, 'b'))
    log.info("-----------------------------------------------")
def crawl_node(self):
    '''Process the node at self.current_uri: record it in the cache,
    download it, extract and filter its HAL links, push any query matches
    out, and advance current_uri to the next node to visit.

    Returns True if the crawl should continue, False if it should stop
    (entry point unreachable, no crawlable links at the entry point, or a
    match was pushed while 'find' was called).
    '''
    #put uri in cache now that we're crawling it, make a note of collisions
    if self.cache.put_and_collision(self.current_uri):
        log.info('HASH COLLISION: value overwritten in hash table.')

    #debug: print state of cache after updating
    log.debug('CACHE STATE: %s', self.cache._cache)

    #download the current resource
    try:
        req = requests.get(self.current_uri)
        log.info('%s downloaded.', self.current_uri)
    #downloading the current resource failed
    except requests.exceptions.ConnectionError:
        log.warn('URI "%s" unresponsive, moving back to previous link...',
                 self.current_uri)

        #if we failed to download the entry point, give up
        if self.current_uri == self.entry_point:
            log.error('URI is entry point, no previous link. Try again when'
                      + ' the entry point URI is available.')
            return False

        #if it wasn't the entry point, go back in our search history
        try:
            prev = self.crawl_history.pop()
            self.current_uri = prev['href']
            self.current_uri_type = prev['type']
            self.current_uri_title = prev['title']
            return True
        #if we don't have any history left, go back to the entry point
        #FIX: was a bare except:, which also swallowed KeyboardInterrupt
        #and SystemExit
        except Exception:
            log.info('exhausted depth of search history, back to entry point')
            self.current_uri = self.entry_point
            self.current_uri_type = "entry_point"
            self.current_uri_title = "entry_point"
            return True
    #end downloading resource

    #put request in JSON form, apply CURIES, get links
    resource_json = req.json()
    log.debug('HAL/JSON RAW RESOURCE: %s', resource_json)
    req_links = self.apply_hal_curies(resource_json)['_links']
    crawl_links = self.get_external_links(req_links)
    #crawl_links is a 'flat' list list[:][fields]
    #fields are href, type, title, in_cache, from_item_list
    log.debug('HAL/JSON LINKS CURIES APPLIED, FILTERED (for history,' +
              'self, create/edit, ws, itemlist flattened): %s', crawl_links)

    #find the uris/resources that match search criteria!
    if self.qry_extra is None:
        #we don't need to actually download the link to see if it matches
        matching_uris = self.query_link_array(crawl_links)
    else:
        #we only have enough information to tell if the current node matches
        matching_uris = self.query_current_node(resource_json)

    #... and send them out!!
    if (self.push_uris_to_queue(matching_uris) and self.find_called):
        return False  #end crawl if we found one and 'find' was called

    #select next link: prefer links we haven't cached (visited) yet
    uncached_links = [x for x in crawl_links if not x['in_cache']]
    log.info('CRAWL: %s LINKS UNCACHED OF %s LINKS FOUND',
             len(uncached_links), len(crawl_links))

    if len(uncached_links) > 0:
        #we have uncached link(s) to follow! randomly pick one.
        random_index = random.randrange(0, len(uncached_links))
        self.crawl_history.push({'href': self.current_uri,
                                 'type': self.current_uri_type,
                                 'title': self.current_uri_title})
        self.current_uri = uncached_links[random_index]['href']
        self.current_uri_type = uncached_links[random_index]['type']
        self.current_uri_title = uncached_links[random_index]['title']
    else:
        #we don't have any uncached options from this node. Damn.
        log.info('CRAWL: no new links available here, crawling back up history')

        #special case of being at the entry point
        if self.current_uri_type == 'entry_point':
            #double check we have something to crawl
            if len(crawl_links) > 0:
                log.info('CRAWL: no uncached links from entrypoint, resetting cache')
                self.cache.clear()  #clear cache

                #randomly select node from crawl_links
                random_index = random.randrange(0, len(crawl_links))
                self.crawl_history.push({'href': self.current_uri,
                                         'type': self.current_uri_type,
                                         'title': self.current_uri_title})
                self.current_uri = crawl_links[random_index]['href']
                self.current_uri_type = crawl_links[random_index]['type']
                self.current_uri_title = crawl_links[random_index]['title']
            else:
                log.error('CRAWL: NO CRAWLABLE LINKS DETECTED AT ENTRY_POINT!!!!')
                return False

        #not at entry point, time to try and move back up in history
        #NOTE(review): this block is NOT under an else, so after the
        #entry-point branch above picks a link, the pop() below immediately
        #restores the just-pushed entry-point values and discards that pick
        #(the cache reset still takes effect). Preserved as-is -- confirm
        #whether an else was intended before changing control flow.
        try:
            prev = self.crawl_history.pop()
            self.current_uri = prev['href']
            self.current_uri_type = prev['type']
            self.current_uri_title = prev['title']
        #FIX: was a bare except:
        except Exception:
            #no history left, not at entry point- jump to entry point
            log.info('CRAWL: crawling back up history, but exhausted history. Jump to entrypoint.')
            self.current_uri = self.entry_point
            self.current_uri_type = 'entry_point'
            self.current_uri_title = 'entry_point'

    log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
    log.info('CRAWL: crawling to %s : %s', self.current_uri_title.upper(),
             self.current_uri)
    log.info('CRAWL: type: %s', self.current_uri_type)
    log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')

    #caller loops on the True return ('recurse' by iteration)
    return True
def crawl(self, namespace="", resource_type=None,
          plural_resource_type=None, resource_title=None, resource_extra=None):
    '''Crawl through the chain, pushing every uri/resource that matches the
    given criteria onto the queue; with no criteria, every resource is
    pushed. resource_type may be given in singular form -- plural forms are
    derived by appending 's'/'es' -- or an irregular plural can be supplied
    via plural_resource_type (i.e. Person -> People). resource_title, when
    given, is cross-checked against the resource's title. All criteria are
    ANDed together. Returns the set of found resources.
    '''
    #--- normalize the type query: namespaced, lowercased, plural forms ---
    if resource_type is None:
        #not searching on resource_type
        self.qry_resource_type = None
    else:
        self.qry_resource_type = (namespace + resource_type).lower()
        #'pluralize' resource after adding namespace
        plural_forms = self.pluralize_resource_name(self.qry_resource_type)
        #add special pluralization if given by user
        if plural_resource_type is not None:
            plural_forms.append(namespace + plural_resource_type)
        self.qry_resource_plural = [form.lower() for form in plural_forms]

    #--- title query is stored lowercased; extra criteria stored as given ---
    self.qry_resource_title = \
        resource_title.lower() if resource_title is not None else None
    self.qry_extra = resource_extra
    #end initializing query variables

    #keep calling crawl_node, pausing between calls, until it returns False
    pages_crawled = 0
    while self.crawl_node():
        time.sleep(self.crawl_delay / 1000.0)
        pages_crawled += 1
        log.info("MAIN CRAWL LOOP ITERATION %s -----------------", pages_crawled)

    log.info("--- crawling ended, %s pages crawled ---", pages_crawled)
    return self.found_resources
def push_uris_to_queue(self, uris):
    '''check uris against found_resources set, and if they're not there,
    push the URI out to the configured transport (a Queue.Queue or a ZMQ
    socket). Returns True if at least one new URI was found.'''
    found_one = False
    for uri in uris:
        #if 'add' returns true, it's not in our set yet
        if self.found_resources.add(uri):
            log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
            log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
            log.info('New Resource Found! %s', uri)
            log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
            log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
            found_one = True

            #push uri and resource to queue!
            if isinstance(self.q, Queue.Queue):
                log.info('QUEUE: Pushing to queue')
                self.q.put(uri)
            elif self.zmq is not None:
                #FIX: corrected 'Pusing' typo in the log message
                log.info('QUEUE: Pushing to ZMQ socket')
                self.zmq.send_string(uri)
            else:
                log.warn('QUEUE: Queue and ZMQ Socket undefined')
    return found_one
def query_current_node(self, json):
    '''Check whether the node currently being crawled matches the active
    query criteria (resource type, title, and extra key/value pairs, all
    ANDed). Returns [self.current_uri] on a match, else [].

    NOTE(review): the parameter name `json` shadows the stdlib module; it is
    kept unchanged to preserve the external interface for keyword callers.
    '''
    matching_uris = []

    if self.qry_resource_type is not None:
        log.info('SEARCH_LIST: looking for singular: %s', self.qry_resource_type)
    if self.qry_resource_title is not None:
        log.info('SEARCH_LIST: looking for title: %s', self.qry_resource_title)
    if self.qry_extra is not None:
        log.info('SEARCH_LIST: looking for %s', self.qry_extra)

    this_link_item_matches = True

    #see if it matches resource_type, if queried for
    if self.qry_resource_type is not None:
        if (any(self.current_uri_type.lower() in x for x in self.qry_resource_plural)
                or self.current_uri_type.lower() == self.qry_resource_type):
            #it does!
            log.info('SEARCH_LIST: matched search_type %s', self.current_uri_type)
        else:
            #it doesn't, but we're searching on resource_type
            this_link_item_matches = False

    #see if it matches resource_title, if queried for
    if self.qry_resource_title is not None:
        if self.current_uri_title.lower() == self.qry_resource_title:
            #it does!
            log.info('SEARCH_LIST: matched search_title %s', self.current_uri_title)
        else:
            #it doesn't, but we're searching on resource_title
            this_link_item_matches = False

    #see if every queried extra key/value pair is present in the body
    if self.qry_extra is not None:
        for key, val in self.qry_extra.iteritems():
            try:
                actual_val = json[key]
            #FIX: was a bare except:; only a missing key (or a non-mapping
            #body) should count as a mismatch, and the try body is now kept
            #to the single statement that can raise
            except (KeyError, TypeError):
                this_link_item_matches = False
                continue
            if actual_val == val:
                log.info('SEARCH_LIST: matched search_extra %s: %s', key, val)
            else:
                this_link_item_matches = False

    #if we made it to here and this_link_item_matches, it's a match!
    if this_link_item_matches:
        matching_uris.append(self.current_uri)

    #return list of matching uris
    return matching_uris
def bfs(self):
    '''Breadth-first crawl from current_uri, up to self.degrees levels deep,
    pushing query matches out as they are found. Returns when the link tree
    is exhausted, the entry point is unreachable, or (when return_if_found
    is set) a match has been pushed.'''
    current_depth = 0
    visited = set()
    #one pending-links bucket per BFS tier
    link_tree = [[] for k in range(self.degrees)]

    while True:
        time.sleep(self.crawl_delay / 1000.0)

        #download the current resource
        try:
            req = requests.get(self.current_uri)
            log.info('%s downloaded.', self.current_uri)
            #put request in JSON form, apply CURIES, get links
            resource_json = req.json()
            log.debug('HAL/JSON RAW RESOURCE: %s', resource_json)
        #downloading the current resource failed
        except requests.exceptions.ConnectionError:
            log.warn('URI "%s" unresponsive, ignoring', self.current_uri)
            #carry on with an empty link set for this node
            resource_json = {'_links': []}
            #if we failed to download the entry point, give up
            if self.current_uri == self.entry_point:
                log.error('URI is entry point, no previous link. Try again when'
                          + ' the entry point URI is available.')
                return
        #end downloading resource

        #get links from this resource
        req_links = self.apply_hal_curies(resource_json)['_links']
        crawl_links = self.flatten_filter_link_array(req_links)
        #crawl_links is a 'flat' list list[:][fields]
        #fields are href, type, title, in_cache, from_item_list
        log.debug('HAL/JSON LINKS CURIES APPLIED, FILTERED (for history,' +
                  'self, create/edit, ws, itemlist flattened): %s', crawl_links)

        #find the uris/resources that match search criteria...
        matching_uris = self.query_link_array(crawl_links)
        #... and send them out!!
        if (self.push_uris_to_queue(matching_uris) and self.return_if_found):
            return  #return if we are using find_first and we found one

        #push all uris that don't match visited to proper depth list
        visited.add(self.current_uri)
        if current_depth < self.degrees:
            #FIX: was a list comprehension used only for its append side
            #effects; a plain loop is the idiomatic form
            for link in crawl_links:
                if link['href'] not in visited:
                    link_tree[current_depth].append(link)
        log.debug('BFS Array: %s', link_tree)
        log.debug('VISITED: %s', visited)

        #select next current_uri and current_uri_type by taking the first
        #pending link from the shallowest non-empty tier; if all tiers are
        #empty, the BFS is complete
        finished = True
        for index in range(len(link_tree)):
            if len(link_tree[index]):
                self.current_uri = link_tree[index][0]['href']
                self.current_uri_type = link_tree[index][0]['type']
                del link_tree[index][0]
                current_depth = index + 1
                finished = False
                break
        if finished:
            return

        log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
        log.info('CRAWL: moving to %s', self.current_uri)
        log.info('CRAWL: type: %s', self.current_uri_type)
        log.info('CRAWL: depth: %s', current_depth)
        log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
def crawl_node(self):
    '''Process the node at self.current_uri: record it in the cache,
    download it, extract and filter its HAL links, push any query matches
    out, and advance current_uri to the next node to visit.

    Returns True if the crawl should continue, False if it should stop
    (entry point unreachable, no crawlable links at the entry point, or a
    match was pushed while 'find' was called).
    '''
    #put uri in cache now that we're crawling it, make a note of collisions
    if self.cache.put_and_collision(self.current_uri):
        log.info('HASH COLLISION: value overwritten in hash table.')

    #debug: print state of cache after updating
    log.debug('CACHE STATE: %s', self.cache._cache)

    #download the current resource
    try:
        req = requests.get(self.current_uri)
        log.info('%s downloaded.', self.current_uri)
    #downloading the current resource failed
    except requests.exceptions.ConnectionError:
        log.warn('URI "%s" unresponsive, moving back to previous link...',
                 self.current_uri)

        #if we failed to download the entry point, give up
        if self.current_uri == self.entry_point:
            log.error('URI is entry point, no previous link. Try again when'
                      + ' the entry point URI is available.')
            return False

        #if it wasn't the entry point, go back in our search history
        try:
            prev = self.crawl_history.pop()
            self.current_uri = prev['href']
            self.current_uri_type = prev['type']
            self.current_uri_title = prev['title']
            return True
        #if we don't have any history left, go back to the entry point
        #FIX: was a bare except:, which also swallowed KeyboardInterrupt
        #and SystemExit
        except Exception:
            log.info('exhausted depth of search history, back to entry point')
            self.current_uri = self.entry_point
            self.current_uri_type = "entry_point"
            self.current_uri_title = "entry_point"
            return True
    #end downloading resource

    #put request in JSON form, apply CURIES, get links
    resource_json = req.json()
    log.debug('HAL/JSON RAW RESOURCE: %s', resource_json)
    req_links = self.apply_hal_curies(resource_json)['_links']
    crawl_links = self.get_external_links(req_links)
    #crawl_links is a 'flat' list list[:][fields]
    #fields are href, type, title, in_cache, from_item_list
    log.debug('HAL/JSON LINKS CURIES APPLIED, FILTERED (for history,' +
              'self, create/edit, ws, itemlist flattened): %s', crawl_links)

    #find the uris/resources that match search criteria!
    if self.qry_extra is None:
        #we don't need to actually download the link to see if it matches
        matching_uris = self.query_link_array(crawl_links)
    else:
        #we only have enough information to tell if the current node matches
        matching_uris = self.query_current_node(resource_json)

    #... and send them out!!
    if (self.push_uris_to_queue(matching_uris) and self.find_called):
        return False  #end crawl if we found one and 'find' was called

    #select next link: prefer links we haven't cached (visited) yet
    uncached_links = [x for x in crawl_links if not x['in_cache']]
    log.info('CRAWL: %s LINKS UNCACHED OF %s LINKS FOUND',
             len(uncached_links), len(crawl_links))

    if len(uncached_links) > 0:
        #we have uncached link(s) to follow! randomly pick one.
        random_index = random.randrange(0, len(uncached_links))
        self.crawl_history.push({
            'href': self.current_uri,
            'type': self.current_uri_type,
            'title': self.current_uri_title
        })
        self.current_uri = uncached_links[random_index]['href']
        self.current_uri_type = uncached_links[random_index]['type']
        self.current_uri_title = uncached_links[random_index]['title']
    else:
        #we don't have any uncached options from this node. Damn.
        log.info('CRAWL: no new links available here, crawling back up history')

        #special case of being at the entry point
        if self.current_uri_type == 'entry_point':
            #double check we have something to crawl
            if len(crawl_links) > 0:
                log.info('CRAWL: no uncached links from entrypoint, resetting cache')
                self.cache.clear()  #clear cache

                #randomly select node from crawl_links
                random_index = random.randrange(0, len(crawl_links))
                self.crawl_history.push({
                    'href': self.current_uri,
                    'type': self.current_uri_type,
                    'title': self.current_uri_title
                })
                self.current_uri = crawl_links[random_index]['href']
                self.current_uri_type = crawl_links[random_index]['type']
                self.current_uri_title = crawl_links[random_index]['title']
            else:
                log.error('CRAWL: NO CRAWLABLE LINKS DETECTED AT ENTRY_POINT!!!!')
                return False

        #not at entry point, time to try and move back up in history
        #NOTE(review): this block is NOT under an else, so after the
        #entry-point branch above picks a link, the pop() below immediately
        #restores the just-pushed entry-point values and discards that pick
        #(the cache reset still takes effect). Preserved as-is -- confirm
        #whether an else was intended before changing control flow.
        try:
            prev = self.crawl_history.pop()
            self.current_uri = prev['href']
            self.current_uri_type = prev['type']
            self.current_uri_title = prev['title']
        #FIX: was a bare except:
        except Exception:
            #no history left, not at entry point- jump to entry point
            log.info('CRAWL: crawling back up history, but exhausted history. Jump to entrypoint.')
            self.current_uri = self.entry_point
            self.current_uri_type = 'entry_point'
            self.current_uri_title = 'entry_point'

    log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
    log.info('CRAWL: crawling to %s : %s', self.current_uri_title.upper(),
             self.current_uri)
    log.info('CRAWL: type: %s', self.current_uri_type)
    log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')

    #caller loops on the True return ('recurse' by iteration)
    return True
def crawl(self, namespace="", resource_type=None,
          plural_resource_type=None, resource_title=None, resource_extra=None):
    '''Walk the chain and push every uri/resource matching the passed
    criteria onto the queue (everything, if no criteria are passed).
    resource_type is matched in singular form or as an auto-derived plural
    ('s'/'es'); irregular plurals (i.e. Person -> People) can be supplied
    through plural_resource_type. resource_title is checked against each
    resource's title. Criteria are ANDed. Returns the found-resources set.
    '''
    #store search criteria in lowercase form, with namespace prepended
    if resource_type is not None:
        lowered_type = (namespace + resource_type).lower()
        self.qry_resource_type = lowered_type
        #derive plural forms, then add any user-supplied special plural
        plurals = self.pluralize_resource_name(lowered_type)
        if plural_resource_type is not None:
            plurals.append(namespace + plural_resource_type)
        self.qry_resource_plural = [entry.lower() for entry in plurals]
    else:
        #not searching on resource_type
        self.qry_resource_type = None

    #title criterion is lowercased; extra criterion is stored unchanged
    if resource_title is not None:
        self.qry_resource_title = resource_title.lower()
    else:
        self.qry_resource_title = None
    self.qry_extra = resource_extra
    #end initializing query variables

    #drive crawl_node until it reports completion, pausing between nodes
    iterations = 0
    while self.crawl_node():
        time.sleep(self.crawl_delay / 1000.0)
        iterations += 1
        log.info("MAIN CRAWL LOOP ITERATION %s -----------------", iterations)

    log.info("--- crawling ended, %s pages crawled ---", iterations)
    return self.found_resources
def push_uris_to_queue(self, uris):
    '''check uris against found_resources set, and if they're not there,
    push the URI out to the configured transport (a Queue.Queue or a ZMQ
    socket). Returns True if at least one new URI was found.'''
    found_one = False
    for uri in uris:
        #if 'add' returns true, it's not in our set yet
        if self.found_resources.add(uri):
            log.info(
                '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
            )
            log.info(
                '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
            )
            log.info('New Resource Found! %s', uri)
            log.info(
                '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
            )
            log.info(
                '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
            )
            found_one = True

            #push uri and resource to queue!
            if isinstance(self.q, Queue.Queue):
                log.info('QUEUE: Pushing to queue')
                self.q.put(uri)
            elif self.zmq is not None:
                #FIX: corrected 'Pusing' typo in the log message
                log.info('QUEUE: Pushing to ZMQ socket')
                self.zmq.send_string(uri)
            else:
                log.warn('QUEUE: Queue and ZMQ Socket undefined')
    return found_one