# Stdlib modules used below
import os
import threading
import zlib

# The remaining names -- BST, HarvestManUrlThreadPool, HarvestManMirrorManager,
# MethodWrapperMetaClass, objects, utils, SUCCESS and the logging helpers
# (info, extrainfo, error) -- are assumed to come from the HarvestMan package.


class HarvestManDataManager(object):
    """ The data manager cum indexer class """

    # For supporting callbacks
    __metaclass__ = MethodWrapperMetaClass
    alias = 'datamgr'

    def __init__(self):
        self.reset()

    def reset(self):
        # URLs which failed with any error
        self._numfailed = 0
        # URLs which failed even after a re-download
        self._numfailed2 = 0
        # URLs which were retried
        self._numretried = 0
        self.cache = None
        self.savedfiles = 0
        self.reposfiles = 0
        self.cachefiles = 0
        self.filteredfiles = 0
        # Config object
        self._cfg = objects.config
        # Dictionary of servers crawled, mapping each
        # server to its meta-data. The meta-data is itself
        # a dictionary which currently has only one
        # entry, i.e. 'accept-ranges'.
        self._serversdict = {}
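        # A hypothetical entry: {'www.example.com': {'accept-ranges': True}}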
        # byte count
        self.bytes = 0L
        # saved bytes count
        self.savedbytes = 0L
        # Redownload flag
        self._redownload = False
        # Mirror manager
        self.mirrormgr = HarvestManMirrorManager.getInstance()
        # Condition object for synchronization
        self.cond = threading.Condition(threading.Lock())
        self._urldb = None
        self.collections = None

    def initialize(self):
        """ Do initializations per project """

        # URL thread group class for multithreaded downloads
        if self._cfg.usethreads:
            self._urlThreadPool = HarvestManUrlThreadPool()
            self._urlThreadPool.spawn_threads()
        else:
            self._urlThreadPool = None

        # URL database, a BST with disk-caching
        self._urldb = BST()
        # Collections database, a BST with disk-caching
        self.collections = BST()
        # Don't set this when testing, otherwise we might
        # be left with many orphaned .bidx... folders!
        if not self._cfg.testing:
            self._urldb.set_auto(2)
            self.collections.set_auto(2)

        # Load any mirrors
        self.mirrormgr.load_mirrors(self._cfg.mirrorfile)
        # Set mirror search flag
        self.mirrormgr.mirrorsearch = self._cfg.mirrorsearch
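
    # A minimal per-project setup sketch (hypothetical caller code; assumes
    # 'objects.config' has already been populated for the project):
    #
    #   dmgr = HarvestManDataManager()
    #   dmgr.initialize()    # spawns URL threads, opens the BST databases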

    def get_urldb(self):
        return self._urldb
    
    def add_url(self, urlobj):
        """ Add urlobject urlobj to the local dictionary """

        # print 'Adding %s with index %d' % (urlobj.get_full_url(), urlobj.index)
        self._urldb.insert(urlobj.index, urlobj)
        
    def update_url(self, urlobj):
        """ Update urlobject urlobj in the local dictionary """

        # print 'Updating %s with index %d' % (urlobj.get_full_url(), urlobj.index)
        self._urldb.update(urlobj.index, urlobj)
        
    def get_url(self, index):
        """ Return the URL object with the given index """

        # return self._urldict[str(index)]
        return self._urldb.lookup(index)

    def get_original_url(self, urlobj):
        """ Return the original URL object for duplicate URLs """

        # This is useful for processing URL objects obtained from
        # the collection object, because many of them might be
        # duplicates and would not have any post-download
        # information such as headers etc.
        if urlobj.refindex != -1:
            return self.get_url(urlobj.refindex)
        else:
            # Return the same URL object to avoid
            # an <if None> check in the caller
            return urlobj
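
    # For example (hypothetical indices): if urlobj B was parsed as a
    # duplicate of A, with B.refindex == A.index, get_original_url(B)
    # returns A, whose post-download headers are visible via B's entry.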
        
    def get_proj_cache_filename(self):
        """ Return the cache filename for the current project """

        # Note that this function does not actually build the cache directory.
        # Get the cache file path
        if self._cfg.projdir and self._cfg.project:
            cachedir = os.path.join(self._cfg.projdir, "hm-cache")
            cachefilename = os.path.join(cachedir, 'cache')

            return cachefilename
        else:
            return ''
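
    # E.g. (hypothetical values) with projdir='/home/user/hm/foo' and a
    # valid project name, this returns '/home/user/hm/foo/hm-cache/cache'.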

    def get_proj_cache_directory(self):
        """ Return the cache directory for the current project """

        # Note that this function does not actually build the cache directory.
        # Get the cache file path
        if self._cfg.projdir and self._cfg.project:
            return os.path.join(self._cfg.projdir, "hm-cache")
        else:
            return ''

    def get_server_dictionary(self):
        return self._serversdict

    def supports_range_requests(self, urlobj):
        """ Check whether the given url object
        supports range requests """

        # Look up its server in the dictionary
        server = urlobj.get_full_domain()
        if server in self._serversdict:
            d = self._serversdict[server]
            return d.get('accept-ranges', False)

        return False
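
    # Hypothetical example: after a crawl, _serversdict might hold
    #   {'www.example.com': {'accept-ranges': True}}
    # in which case a byte-range (partial) download can be attempted
    # for any URL on that server.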
        
    def read_project_cache(self):
        """ Try to read the project cache file """

        # Get the cache directory and read the cache from it
        info('Reading Project Cache...')
        cachereader = utils.HarvestManCacheReaderWriter(self.get_proj_cache_directory())
        obj, found = cachereader.read_project_cache()
        self._cfg.cachefound = found
        self.cache = obj
        if not found:
            # Fresh cache - create structure...
            self.cache.create('url', 'last_modified', 'etag', 'updated',
                              'location', 'checksum', 'content_length',
                              'data', 'headers')

            # Create an index on URL
            self.cache.create_index('url')
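
    # Each cache record is a dictionary keyed on the fields created
    # above, e.g. (hypothetical): {'url': 'http://www.example.com/x.html',
    # 'location': ..., 'data': <zlib-compressed bytes>, ...}; the 'url'
    # index allows lookup via self.cache._url[url], as used in
    # write_file_from_cache below.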

    def write_file_from_cache(self, urlobj):
        """ Write file from url cache. This
        works only if the cache dictionary of this
        url has a key named 'data' """

        ret = False

        # print 'Inside write_file_from_cache...'
        url = urlobj.get_full_url()
        content = self.cache._url[url]
        
        if len(content):
            # Value itself is a dictionary
            item = content[0]
            if not item.has_key('data'):
                return ret
            else:
                urldata = item['data']
                if urldata:
                    fileloc = item['location']
                    # Write file
                    extrainfo("Updating file from cache=>", fileloc)
                    try:
                        if SUCCESS(self.create_local_directory(os.path.dirname(fileloc))):
                            f=open(fileloc, 'wb')
                            f.write(zlib.decompress(urldata))
                            f.close()
                            ret = True
                    except (IOError, zlib.error), e:
                        error("Error:",e)
                                
        return ret
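
# A minimal end-to-end sketch (hypothetical driver code; assumes the
# HarvestMan package is importable and a project has been configured):
#
#   dmgr = HarvestManDataManager()
#   dmgr.initialize()
#   dmgr.read_project_cache()
#   for urlobj in parsed_urls:                 # 'parsed_urls' is assumed
#       dmgr.add_url(urlobj)
#       if not dmgr.write_file_from_cache(urlobj):
#           pass                               # no cached copy; fetch afresh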