# Stdlib modules used below
import os
import threading
import zlib

# The remaining names -- BST, HarvestManUrlThreadPool, HarvestManMirrorManager,
# MethodWrapperMetaClass, objects, utils, SUCCESS and the logging helpers
# (info, extrainfo, error) -- are assumed to come from the HarvestMan package.


class HarvestManDataManager(object):
    """ The data manager cum indexer class """

    # For supporting callbacks
    __metaclass__ = MethodWrapperMetaClass
    alias = 'datamgr'

    def __init__(self):
        self.reset()

    def reset(self):
        # URLs which failed with any error
        self._numfailed = 0
        # URLs which failed even after a re-download
        self._numfailed2 = 0
        # URLs which were retried
        self._numretried = 0
        self.cache = None
        self.savedfiles = 0
        self.reposfiles = 0
        self.cachefiles = 0
        self.filteredfiles = 0
        # Config object
        self._cfg = objects.config
        # Dictionary of servers crawled, mapping each
        # server to its meta-data. The meta-data is itself
        # a dictionary which currently has only one
        # entry, i.e. 'accept-ranges'.
        self._serversdict = {}
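        # A hypothetical entry: {'www.example.com': {'accept-ranges': True}}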
        # byte count
        self.bytes = 0L
        # saved bytes count
        self.savedbytes = 0L
        # Redownload flag
        self._redownload = False
        # Mirror manager
        self.mirrormgr = HarvestManMirrorManager.getInstance()
        # Condition object for synchronization
        self.cond = threading.Condition(threading.Lock())
        self._urldb = None
        self.collections = None

    def initialize(self):
        """ Do initializations per project """

        # URL thread group class for multithreaded downloads
        if self._cfg.usethreads:
            self._urlThreadPool = HarvestManUrlThreadPool()
            self._urlThreadPool.spawn_threads()
        else:
            self._urlThreadPool = None

        # URL database, a BST with disk-caching
        self._urldb = BST()
        # Collections database, a BST with disk-caching
        self.collections = BST()
        # Don't set this when testing, otherwise we might
        # be left with many orphaned .bidx... folders!
        if not self._cfg.testing:
            self._urldb.set_auto(2)
            self.collections.set_auto(2)

        # Load any mirrors
        self.mirrormgr.load_mirrors(self._cfg.mirrorfile)
        # Set mirror search flag
        self.mirrormgr.mirrorsearch = self._cfg.mirrorsearch
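
    # A minimal per-project setup sketch (hypothetical caller code; assumes
    # 'objects.config' has already been populated for the project):
    #
    #   dmgr = HarvestManDataManager()
    #   dmgr.initialize()    # spawns URL threads, opens the BST databases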

    def get_urldb(self):
        return self._urldb
    
    def add_url(self, urlobj):
        """ Add urlobject urlobj to the local dictionary """

        # print 'Adding %s with index %d' % (urlobj.get_full_url(), urlobj.index)
        self._urldb.insert(urlobj.index, urlobj)
        
    def update_url(self, urlobj):
        """ Update urlobject urlobj in the local dictionary """

        # print 'Updating %s with index %d' % (urlobj.get_full_url(), urlobj.index)
        self._urldb.update(urlobj.index, urlobj)
        
    def get_url(self, index):
        """ Return the URL object with the given index """

        # return self._urldict[str(index)]
        return self._urldb.lookup(index)

    def get_original_url(self, urlobj):
        """ Return the original URL object for duplicate URLs """

        # This is useful for processing URL objects obtained from
        # the collection object, because many of them might be
        # duplicates and would not have any post-download
        # information such as headers etc.
        if urlobj.refindex != -1:
            return self.get_url(urlobj.refindex)
        else:
            # Return the same URL object to avoid
            # an <if None> check in the caller
            return urlobj
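
    # For example (hypothetical indices): if urlobj B was parsed as a
    # duplicate of A, with B.refindex == A.index, get_original_url(B)
    # returns A, whose post-download headers are visible via B's entry.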
        
    def get_proj_cache_filename(self):
        """ Return the cache filename for the current project """

        # Note that this function does not actually build the cache directory.
        # Get the cache file path
        if self._cfg.projdir and self._cfg.project:
            cachedir = os.path.join(self._cfg.projdir, "hm-cache")
            cachefilename = os.path.join(cachedir, 'cache')

            return cachefilename
        else:
            return ''
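
    # E.g. (hypothetical values) with projdir='/home/user/hm/foo' and a
    # valid project name, this returns '/home/user/hm/foo/hm-cache/cache'.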

    def get_proj_cache_directory(self):
        """ Return the cache directory for the current project """

        # Note that this function does not actually build the cache directory.
        # Get the cache file path
        if self._cfg.projdir and self._cfg.project:
            return os.path.join(self._cfg.projdir, "hm-cache")
        else:
            return ''

    def get_server_dictionary(self):
        return self._serversdict

    def supports_range_requests(self, urlobj):
        """ Check whether the given url object
        supports range requests """

        # Look up its server in the dictionary
        server = urlobj.get_full_domain()
        if server in self._serversdict:
            d = self._serversdict[server]
            return d.get('accept-ranges', False)

        return False
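
    # Hypothetical example: after a crawl, _serversdict might hold
    #   {'www.example.com': {'accept-ranges': True}}
    # in which case a byte-range (partial) download can be attempted
    # for any URL on that server.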
        
    def read_project_cache(self):
        """ Try to read the project cache file """

        # Get the cache directory and read the cache from it
        info('Reading Project Cache...')
        cachereader = utils.HarvestManCacheReaderWriter(self.get_proj_cache_directory())
        obj, found = cachereader.read_project_cache()
        self._cfg.cachefound = found
        self.cache = obj
        if not found:
            # Fresh cache - create structure...
            self.cache.create('url', 'last_modified', 'etag', 'updated',
                              'location', 'checksum', 'content_length',
                              'data', 'headers')

            # Create an index on URL
            self.cache.create_index('url')
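
    # Each cache record is a dictionary keyed on the fields created
    # above, e.g. (hypothetical): {'url': 'http://www.example.com/x.html',
    # 'location': ..., 'data': <zlib-compressed bytes>, ...}; the 'url'
    # index allows lookup via self.cache._url[url], as used in
    # write_file_from_cache below.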

    def write_file_from_cache(self, urlobj):
        """ Write file from url cache. This
        works only if the cache dictionary of this
        url has a key named 'data' """

        ret = False

        # print 'Inside write_file_from_cache...'
        url = urlobj.get_full_url()
        content = self.cache._url[url]
        
        if len(content):
            # Value itself is a dictionary
            item = content[0]
            if not item.has_key('data'):
                return ret
            else:
                urldata = item['data']
                if urldata:
                    fileloc = item['location']
                    # Write file
                    extrainfo("Updating file from cache=>", fileloc)
                    try:
                        if SUCCESS(self.create_local_directory(os.path.dirname(fileloc))):
                            f=open(fileloc, 'wb')
                            f.write(zlib.decompress(urldata))
                            f.close()
                            ret = True
                    except (IOError, zlib.error), e:
                        error("Error:",e)
                                
        return ret
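
# A minimal end-to-end sketch (hypothetical driver code; assumes the
# HarvestMan package is importable and a project has been configured):
#
#   dmgr = HarvestManDataManager()
#   dmgr.initialize()
#   dmgr.read_project_cache()
#   for urlobj in parsed_urls:                 # 'parsed_urls' is assumed
#       dmgr.add_url(urlobj)
#       if not dmgr.write_file_from_cache(urlobj):
#           pass                               # no cached copy; fetch afresh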