# Imports needed by this excerpt. The module paths below follow the
# usual HarvestMan package layout but are assumptions as far as this
# excerpt goes; the star imports are expected to provide 'objects',
# 'info', 'extrainfo', 'error' and 'SUCCESS'.
import os
import zlib
import threading

from harvestman.lib import utils
from harvestman.lib.common.common import *
from harvestman.lib.common.macros import *
from harvestman.lib.common.bst import BST
from harvestman.lib.urlthread import HarvestManUrlThreadPool
from harvestman.lib.mirrors import HarvestManMirrorManager
from harvestman.lib.methodwrapper import MethodWrapperMetaClass


class HarvestManDataManager(object):
    """ The data manager cum indexer class """

    # For supporting callbacks
    __metaclass__ = MethodWrapperMetaClass
    alias = 'datamgr'

    def __init__(self):
        self.reset()

    def reset(self):
        # URLs which failed with any error
        self._numfailed = 0
        # URLs which failed even after a re-download
        self._numfailed2 = 0
        # URLs which were retried
        self._numretried = 0
        self.cache = None
        self.savedfiles = 0
        self.reposfiles = 0
        self.cachefiles = 0
        self.filteredfiles = 0
        # Config object
        self._cfg = objects.config
        # Dictionary of servers crawled and their meta-data.
        # The meta-data is a dictionary which currently has
        # only one entry, i.e. accept-ranges.
        self._serversdict = {}
        # Byte count
        self.bytes = 0L
        # Saved bytes count
        self.savedbytes = 0L
        # Re-download flag
        self._redownload = False
        # Mirror manager
        self.mirrormgr = HarvestManMirrorManager.getInstance()
        # Condition object for synchronization
        self.cond = threading.Condition(threading.Lock())
        self._urldb = None
        self.collections = None

    def initialize(self):
        """ Do initializations per project """

        # URL thread group class for multithreaded downloads
        if self._cfg.usethreads:
            self._urlThreadPool = HarvestManUrlThreadPool()
            self._urlThreadPool.spawn_threads()
        else:
            self._urlThreadPool = None

        # URL database, a BST with disk-caching
        self._urldb = BST()
        # Collections database, a BST with disk-caching
        self.collections = BST()
        # Don't set auto mode when testing, otherwise we might
        # be left with many orphaned .bidx... folders!
        if not self._cfg.testing:
            self._urldb.set_auto(2)
            self.collections.set_auto(2)

        # Load any mirrors
        self.mirrormgr.load_mirrors(self._cfg.mirrorfile)
        # Set mirror search flag
        self.mirrormgr.mirrorsearch = self._cfg.mirrorsearch

    def get_urldb(self):
        return self._urldb

    def add_url(self, urlobj):
        """ Add the url object 'urlobj' to the URL database """

        self._urldb.insert(urlobj.index, urlobj)

    def update_url(self, urlobj):
        """ Update the url object 'urlobj' in the URL database """

        self._urldb.update(urlobj.index, urlobj)

    def get_url(self, index):
        return self._urldb.lookup(index)

    def get_original_url(self, urlobj):
        # Return the original URL object for duplicate URLs.
        # This is useful for processing URL objects obtained from
        # the collection object, because many of them might be
        # duplicates and would not have any post-download
        # information such as headers etc.
        if urlobj.refindex != -1:
            return self.get_url(urlobj.refindex)
        else:
            # Return the same URL object to avoid
            # an <if None> check on the caller
            return urlobj

    def get_proj_cache_filename(self):
        """ Return the cache filename for the current project """

        # Note that this function does not actually build the
        # cache directory.
        if self._cfg.projdir and self._cfg.project:
            cachedir = os.path.join(self._cfg.projdir, "hm-cache")
            return os.path.join(cachedir, 'cache')
        else:
            return ''

    def get_proj_cache_directory(self):
        """ Return the cache directory for the current project """

        # Note that this function does not actually build the
        # cache directory.
        if self._cfg.projdir and self._cfg.project:
            return os.path.join(self._cfg.projdir, "hm-cache")
        else:
            return ''

    def get_server_dictionary(self):
        return self._serversdict

    def supports_range_requests(self, urlobj):
        """ Check whether the server of the given url object
        supports byte-range requests """

        # Look up its server in the dictionary
        server = urlobj.get_full_domain()
        if server in self._serversdict:
            d = self._serversdict[server]
            return d.get('accept-ranges', False)

        return False

    def read_project_cache(self):
        """ Try to read the project cache file """

        info('Reading Project Cache...')
        cachereader = utils.HarvestManCacheReaderWriter(self.get_proj_cache_directory())
        obj, found = cachereader.read_project_cache()
        self._cfg.cachefound = found
        self.cache = obj
        if not found:
            # Fresh cache - create the record structure...
            self.cache.create('url', 'last_modified', 'etag',
                              'updated', 'location', 'checksum',
                              'content_length', 'data', 'headers')
            # Create an index on URL
            self.cache.create_index('url')

    def write_file_from_cache(self, urlobj):
        """ Write the file for this URL from the url cache. This
        works only if the cache record of the url has a key
        named 'data' """

        ret = False
        url = urlobj.get_full_url()
        # The index created by create_index('url') is exposed
        # as the _url attribute of the cache object
        content = self.cache._url[url]
        if content:
            # Each item is a cache record dictionary
            item = content[0]
            urldata = item.get('data')
            if urldata:
                fileloc = item['location']
                # Write the file
                extrainfo("Updating file from cache=>", fileloc)
                try:
                    # create_local_directory() is defined elsewhere
                    # in this class (not part of this excerpt)
                    if SUCCESS(self.create_local_directory(os.path.dirname(fileloc))):
                        f = open(fileloc, 'wb')
                        f.write(zlib.decompress(urldata))
                        f.close()
                        ret = True
                except (IOError, zlib.error), e:
                    error("Error:", e)

        return ret
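
# Typical per-project lifecycle, sketched for orientation. This is an
# illustration, not code from the original module; it assumes the
# surrounding HarvestMan framework has already populated objects.config
# and that 'urlobj' is a parsed HarvestMan url object with .index and
# .refindex attributes, as the methods above expect:
#
#   dmgr = HarvestManDataManager()
#   dmgr.initialize()                    # thread pool, URL/collection DBs, mirrors
#   dmgr.add_url(urlobj)                 # index a url object by its .index key
#   same = dmgr.get_url(urlobj.index)    # BST lookup by index
#   orig = dmgr.get_original_url(same)   # resolve duplicates via .refindex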
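
# A minimal, self-contained sketch of the cache record shape that
# write_file_from_cache() expects: the fields declared in
# read_project_cache(), with 'data' holding zlib-compressed content.
# All values below are made up for illustration.
if __name__ == '__main__':
    page = '<html><body>hello</body></html>'
    record = { 'url'            : 'http://www.example.com/index.html',
               'location'       : 'example.com/index.html',
               'data'           : zlib.compress(page),
               'content_length' : len(page) }
    # This is the round-trip write_file_from_cache() performs on 'data'
    assert zlib.decompress(record['data']) == page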