Ejemplo n.º 1
0
    def testObjectKeyGeneration(self):
        """Check that cached objects are stored at the expected location.

        The expected path is the cache directory joined with the object id
        that ``Cache.getObjectId`` yields for the entry's key.
        """
        cache_root = get_cache_dir(3)
        disk_cache = DiskCache(cache_root)

        def expected_path(key):
            # Path at which this test expects the entry for ``key``.
            return join(cache_root, Cache.getObjectId(key))

        # Entry stored under an explicitly supplied object id.
        disk_cache.fetchObjectId(1, str, 1)
        assert exists(expected_path(1))

        # Entry stored through ``fetch``; the test expects its key to be the
        # (positional args, keyword args) pair of the call.
        disk_cache.fetch(str, 2)
        fetch_key = ((2, ), ())
        assert exists(expected_path(fetch_key))
Ejemplo n.º 2
0
 def testObjectKeyGeneration(self):
     """Check that cached objects are stored at the expected location.

     The expected path is the cache directory joined with the object id
     that ``Cache.getObjectId`` yields for the entry's key.
     """
     cache_root = get_cache_dir(3)
     disk_cache = DiskCache(cache_root)

     def expected_path(key):
         # Path at which this test expects the entry for ``key``.
         return join(cache_root, Cache.getObjectId(key))

     # Entry stored under an explicitly supplied object id.
     disk_cache.fetchObjectId(1, str, 1)
     assert exists(expected_path(1))

     # Entry stored through ``fetch``; the test expects its key to be the
     # (positional args, keyword args) pair of the call.
     disk_cache.fetch(str, 2)
     fetch_key = ((2,), ())
     assert exists(expected_path(fetch_key))
Ejemplo n.º 3
0
class Calais(object):
    """Handler for submitting text to the OpenCalais web service.

    ``analyze`` posts the text, unpacks the XML payload from the service
    response and returns the parsed annotations.  When a cache directory
    is configured, the raw response is obtained through a ``DiskCache`` so
    that data already present in the cache is not fetched again.
    """

    # Class-level defaults; ``__init__`` overrides them per instance.
    submitter = USER_AGENT % "Calais"
    allow_distro = "false"
    allow_search = "false"
    api_key = ""

    def __init__(self,
                 submitter,
                 api_key=OPENCALAIS_KEY,
                 allow_distro="false",
                 allow_search="false",
                 cache_dir=OPENCALAIS_CACHE_DIR):
        """
        Creates a new handler for communicating with OpenCalais.

        @param[in] submitter    a string identifying your application.
        @param[in] api_key      a string with your OpenCalais API key
                                (get it here:
                                http://developer.opencalais.com/apps/register).
        @param[in] allow_distro if set to 'true', gives OpenCalais
                                permission to distribute the metadata
                                extracted from your submissions
                                (default: 'false').
        @param[in] allow_search if set to 'true', tells OpenCalais that
                                future searches can be performed on the
                                extracted metadata (default: 'false').
        @param[in] cache_dir    directory for the response cache; a falsy
                                value disables caching.
        """
        # Kept as an assert so callers still see AssertionError; note that
        # asserts are stripped when Python runs with -O.
        assert api_key, "an OpenCalais API key is required"
        self.submitter = submitter
        # Honour the caller's choice instead of hard-coding "false".
        self.allow_distro = allow_distro
        self.allow_search = allow_search
        self.api_key = api_key
        # Always define the attribute: analyze() tests it against None.
        self.cache = None
        if cache_dir:
            self.cache = DiskCache(cache_dir,
                                   cache_nesting_level=2,
                                   cache_file_suffix=".xml")

    @staticmethod
    def random_id(self=None):
        """
        Creates a random 10-character ID for your submission.

        @param[in] self unused; only accepted so that existing calls which
                        pass an argument keep working.
        @returns a string of 10 random ASCII letters and digits
        """
        # The alphabet lives in the ``string`` module, not on ``str``.
        import string

        chars = string.ascii_letters + string.digits
        return "".join(choice(chars) for _ in range(10))

    @staticmethod
    def content_id(text):
        """
        Creates a SHA1 hash of the text of your submission.

        @param[in] text the submission; text that is not already a byte
                        string is encoded as UTF-8 before hashing.
        @returns the hexadecimal SHA1 digest
        """
        try:
            import hashlib
            h = hashlib.sha1()
        except ImportError:
            import sha
            h = sha.new()

        # Hash functions operate on bytes; encode explicitly so non-ASCII
        # text does not fail.
        if not isinstance(text, bytes):
            text = text.encode("utf-8")

        h.update(text)
        return h.hexdigest()

    def analyze(self, text, content_type="text/txt"):
        """ Submits 'text' to OpenCalais for analysis and returns the
            extracted metadata.
            Set the content-type to 'text/html' if you are submitting HTML data.

            @param[in] text         the content to analyze
            @param[in] content_type content type announced to OpenCalais
            @returns the list of annotations produced by parse()
        """
        externalID = self.content_id(text)
        paramsXML = PARAMS_XML % (content_type, self.allow_distro,
                                  self.allow_search, externalID,
                                  self.submitter)
        param = urlencode({
            'licenseID': self.api_key,
            'content': text,
            'paramsXML': paramsXML
        })

        # do not fetch the data again, if a file exists in the cache
        get_calais_data = lambda x: Retrieve(Calais.__name__).open(
            OPENCALAIS_URL, x).read()

        if self.cache is None:
            calais_data = get_calais_data(param)
        else:
            calais_data = self.cache.fetch(get_calais_data, param)

        return self.parse(self.unpack(calais_data))

    @staticmethod
    def unpack(calais_data):
        """ extracts calais' xml response from the data sent by the calais
            webservice

            @param[in] calais_data the raw response of the webservice
            @returns the text of the first 'string' element, prefixed with
                     an XML declaration
        """
        dom = minidom.parseString(calais_data)
        return """<?xml version="1.0" encoding="utf-8"?>\n""" \
                 + dom.getElementsByTagName("string")[0].firstChild.data

    @staticmethod
    def cleanup_xml(xml_data):
        """ removes comments from xml-data-streams provided by opencalais
            @param[in] xml_data
            @returns the xml data without any comments
        """
        # Removing a comment can splice a new one together from the
        # surrounding text, so repeat until nothing changes.  Stopping at
        # the fixed point (rather than while '<!--' is present) also
        # terminates on input with an unterminated '<!--'.
        previous = None
        while previous != xml_data:
            previous = xml_data
            xml_data = re.sub(r'<!--[\s\S]*?-->', '', xml_data)

        return xml_data

    @staticmethod
    def parse(xml_data):
        """ parses opencalais' xml output

            @param[in] xml_data the xml document returned by unpack()
            @returns a list with one dictionary per annotation, mapping the
                     node name to the node's attributes plus a 'data' entry
                     holding the data of the node's first child
        """
        things = []

        xml_data = Calais.cleanup_xml(xml_data)
        dom = minidom.parseString(xml_data.encode("utf8"))

        for document in dom.getElementsByTagName("CalaisSimpleOutputFormat"):
            for annotations in document.childNodes:
                # nodes without children carry no data
                if not annotations.hasChildNodes():
                    continue

                # for 'Topics' the annotation is taken from its first child
                if annotations.nodeName == 'Topics':
                    annotations = annotations.firstChild

                nodeName = annotations.nodeName
                nodeAttr = dict(annotations.attributes.items())

                nodeAttr.update({'data': annotations.firstChild.data})

                things.append({nodeName: nodeAttr})

        return things
Ejemplo n.º 4
0
class SkipTestDiskCached(TestCached):
    """Tests for DiskCache, the DiskCached decorator and IterableCache."""

    # Cached helper functions, each backed by its own cache directory.
    @staticmethod
    @DiskCached(get_cache_dir(1))
    def add(a=1, b=2):
        total = a + b
        return total

    @staticmethod
    @DiskCached(get_cache_dir(2))
    def sub(a, b):
        difference = a - b
        return difference

    def setUp(self):
        """Create the DiskCache instance shared by the tests."""
        cache_dir = get_cache_dir(4)
        self.diskCache = DiskCache(cache_dir)

    def tearDown(self):
        """Remove every cache directory (numbers 0-9) that exists."""
        for dir_no in range(10):
            if not exists(get_cache_dir(dir_no)):
                continue
            rmtree(get_cache_dir(dir_no))

    def testObjectKeyGeneration(self):
        """Check that cached objects are stored at the expected location."""
        cache_root = get_cache_dir(3)
        disk_cache = DiskCache(cache_root)

        def expected_path(key):
            # Path at which this test expects the entry for ``key``.
            return join(cache_root, Cache.getObjectId(key))

        # Entry stored under an explicitly supplied object id.
        disk_cache.fetchObjectId(1, str, 1)
        assert exists(expected_path(1))

        # Entry stored through ``fetch``; the test expects its key to be
        # the (positional args, keyword args) pair of the call.
        disk_cache.fetch(str, 2)
        fetch_key = ((2, ), ())
        assert exists(expected_path(fetch_key))

    def testContains(self):
        """Verify that ``key in cache`` works."""
        # DiskCache instance
        fetched = self.diskCache.fetchObjectId(1, str, 1)
        assert fetched == "1"

        assert 1 in self.diskCache
        assert 2 not in self.diskCache

        # DiskCached-decorated function
        total = self.add(12, 14)
        assert total == 26
        add_key = self.add.getKey(12, 14)
        assert add_key in self.add
        assert 9 not in self.add

    def testDelItem(self):
        """Verify that ``del cache[key]`` removes the entry."""
        # DiskCache instance
        fetched = self.diskCache.fetch(str, 2)
        assert fetched == "2"
        cache_key = self.diskCache.getKey(2)
        assert cache_key in self.diskCache
        del self.diskCache[cache_key]
        assert cache_key not in self.diskCache

        # DiskCached-decorated function
        total = self.add(12, 13)
        assert total == 25
        add_key = self.add.getKey(12, 13)
        assert add_key == ((12, 13), ())
        assert add_key in self.add
        del self.add[add_key]
        assert add_key not in self.add

    def testDirectCall(self):
        """Call the cache object directly through ``__call__``."""
        cached_str = DiskCache(get_cache_dir(4), fn=str)

        result = cached_str(7)
        assert result == "7"
        assert cached_str.getKey(7) in cached_str

    def testIterableCache(self):
        """Compare IterableCache results with freshly built sequences."""
        iterable_cache = IterableCache(get_cache_dir(5))

        def make_sequence(size):
            return list(range(size))

        for size in (4, 5, 6):
            cached_items = iterable_cache.fetch(make_sequence, size)

            # Pairwise comparison against a freshly computed sequence.
            for cached, expected in zip(cached_items, make_sequence(size)):
                assert cached == expected

    @pytest.mark.slow
    def testThreadSafety(self):
        """Run the workers ``f`` and ``g`` against one cache from a pool.

        ``f`` and ``g`` are defined outside this class; each of the 1000
        rounds maps both of them over 60 references to the same DiskCache
        using a pool of 12 workers.
        """
        for _ in range(1000):
            shared_cache = DiskCache(get_cache_dir(6))
            pool = Pool(12)

            pool.map(f, [shared_cache] * 60)
            pool.map(g, [shared_cache] * 60)

            pool.close()
            pool.join()
Ejemplo n.º 5
0
class Calais:
    """Handler for submitting text to the OpenCalais web service.

    ``analyze`` posts the text, unpacks the XML payload from the service
    response and returns the parsed annotations.  When a cache directory
    is configured, the raw response is obtained through a ``DiskCache`` so
    that data already present in the cache is not fetched again.
    """

    # Class-level defaults; ``__init__`` overrides them per instance.
    submitter = USER_AGENT % "Calais"
    allow_distro = "false"
    allow_search = "false"
    api_key = ""

    def __init__(self, submitter, api_key=OPENCALAIS_KEY, allow_distro="false", allow_search="false", cache_dir=OPENCALAIS_CACHE_DIR):
        """
        Creates a new handler for communicating with OpenCalais.

        @param[in] submitter    a string identifying your application.
        @param[in] api_key      a string with your OpenCalais API key
                                (get it here:
                                http://developer.opencalais.com/apps/register).
        @param[in] allow_distro if set to 'true', gives OpenCalais
                                permission to distribute the metadata
                                extracted from your submissions
                                (default: 'false').
        @param[in] allow_search if set to 'true', tells OpenCalais that
                                future searches can be performed on the
                                extracted metadata (default: 'false').
        @param[in] cache_dir    directory for the response cache; a falsy
                                value disables caching.
        """
        # Kept as an assert so callers still see AssertionError; note that
        # asserts are stripped when Python runs with -O.
        assert api_key, "an OpenCalais API key is required"
        self.submitter = submitter
        # Honour the caller's choice instead of hard-coding "false".
        self.allow_distro = allow_distro
        self.allow_search = allow_search
        self.api_key = api_key
        # Always define the attribute: analyze() tests it against None.
        self.cache = None
        if cache_dir:
            self.cache = DiskCache(cache_dir, cache_nesting_level=2, cache_file_suffix=".xml")

    @staticmethod
    def random_id(self=None):
        """
        Creates a random 10-character ID for your submission.

        @param[in] self unused; only accepted so that existing calls which
                        pass an argument keep working.
        @returns a string of 10 random ASCII letters and digits
        """
        # The alphabet lives in the ``string`` module, not on ``str``.
        import string

        chars = string.ascii_letters + string.digits
        return "".join(choice(chars) for _ in range(10))

    @staticmethod
    def content_id(text):
        """
        Creates a SHA1 hash of the text of your submission.

        @param[in] text the submission; text that is not already a byte
                        string is encoded as UTF-8 before hashing.
        @returns the hexadecimal SHA1 digest
        """
        try:
            import hashlib
            h = hashlib.sha1()
        except ImportError:
            import sha
            h = sha.new()

        # Hash functions operate on bytes; encode explicitly so non-ASCII
        # text does not fail.
        if not isinstance(text, bytes):
            text = text.encode("utf-8")

        h.update(text)
        return h.hexdigest()

    def analyze(self, text, content_type="text/txt"):
        """ Submits 'text' to OpenCalais for analysis and returns the
            extracted metadata.
            Set the content-type to 'text/html' if you are submitting HTML data.

            @param[in] text         the content to analyze
            @param[in] content_type content type announced to OpenCalais
            @returns the list of annotations produced by parse()
        """
        externalID = self.content_id(text)
        paramsXML = PARAMS_XML % (content_type, self.allow_distro, self.allow_search, externalID, self.submitter)
        param = urlencode({'licenseID': self.api_key, 'content': text, 'paramsXML': paramsXML})

        # do not fetch the data again, if a file exists in the cache
        get_calais_data = lambda x: Retrieve(Calais.__name__).open(OPENCALAIS_URL, x).read()

        if self.cache is None:
            calais_data = get_calais_data(param)
        else:
            calais_data = self.cache.fetch(get_calais_data, param)

        return self.parse(self.unpack(calais_data))

    @staticmethod
    def unpack(calais_data):
        """ extracts calais' xml response from the data sent by the calais
            webservice

            @param[in] calais_data the raw response of the webservice
            @returns the text of the first 'string' element, prefixed with
                     an XML declaration
        """
        dom = minidom.parseString(calais_data)
        return """<?xml version="1.0" encoding="utf-8"?>\n""" \
                 + dom.getElementsByTagName("string")[0].firstChild.data

    @staticmethod
    def cleanup_xml(xml_data):
        """ removes comments from xml-data-streams provided by opencalais
            @param[in] xml_data
            @returns the xml data without any comments
        """
        # Removing a comment can splice a new one together from the
        # surrounding text, so repeat until nothing changes.  Stopping at
        # the fixed point (rather than while '<!--' is present) also
        # terminates on input with an unterminated '<!--'.
        previous = None
        while previous != xml_data:
            previous = xml_data
            xml_data = re.sub(r'<!--[\s\S]*?-->', '', xml_data)

        return xml_data

    @staticmethod
    def parse(xml_data):
        """ parses opencalais' xml output

            @param[in] xml_data the xml document returned by unpack()
            @returns a list with one dictionary per annotation, mapping the
                     node name to the node's attributes plus a 'data' entry
                     holding the data of the node's first child
        """
        things = []

        xml_data = Calais.cleanup_xml(xml_data)
        dom = minidom.parseString(xml_data.encode("utf8"))

        for document in dom.getElementsByTagName("CalaisSimpleOutputFormat"):
            for annotations in document.childNodes:
                # nodes without children carry no data
                if not annotations.hasChildNodes():
                    continue

                # for 'Topics' the annotation is taken from its first child
                if annotations.nodeName == 'Topics':
                    annotations = annotations.firstChild

                nodeName = annotations.nodeName
                nodeAttr = dict(annotations.attributes.items())

                nodeAttr.update({'data': annotations.firstChild.data})

                things.append({nodeName: nodeAttr})

        return things
Ejemplo n.º 6
0
class SkipTestDiskCached(TestCached):
    """Tests for DiskCache, the DiskCached decorator and IterableCache."""

    # Cached helper functions, each backed by its own cache directory.
    @staticmethod
    @DiskCached(get_cache_dir(1))
    def add(a=1, b=2):
        total = a + b
        return total

    @staticmethod
    @DiskCached(get_cache_dir(2))
    def sub(a, b):
        difference = a - b
        return difference

    def setUp(self):
        """Create the DiskCache instance shared by the tests."""
        cache_dir = get_cache_dir(4)
        self.diskCache = DiskCache(cache_dir)

    def tearDown(self):
        """Remove every cache directory (numbers 0-9) that exists."""
        for dir_no in range(10):
            if not exists(get_cache_dir(dir_no)):
                continue
            rmtree(get_cache_dir(dir_no))

    def testObjectKeyGeneration(self):
        """Check that cached objects are stored at the expected location."""
        cache_root = get_cache_dir(3)
        disk_cache = DiskCache(cache_root)

        def expected_path(key):
            # Path at which this test expects the entry for ``key``.
            return join(cache_root, Cache.getObjectId(key))

        # Entry stored under an explicitly supplied object id.
        disk_cache.fetchObjectId(1, str, 1)
        assert exists(expected_path(1))

        # Entry stored through ``fetch``; the test expects its key to be
        # the (positional args, keyword args) pair of the call.
        disk_cache.fetch(str, 2)
        fetch_key = ((2,), ())
        assert exists(expected_path(fetch_key))

    def testContains(self):
        """Verify that ``key in cache`` works."""
        # DiskCache instance
        fetched = self.diskCache.fetchObjectId(1, str, 1)
        assert fetched == "1"

        assert 1 in self.diskCache
        assert 2 not in self.diskCache

        # DiskCached-decorated function
        total = self.add(12, 14)
        assert total == 26
        add_key = self.add.getKey(12, 14)
        assert add_key in self.add
        assert 9 not in self.add

    def testDelItem(self):
        """Verify that ``del cache[key]`` removes the entry."""
        # DiskCache instance
        fetched = self.diskCache.fetch(str, 2)
        assert fetched == "2"
        cache_key = self.diskCache.getKey(2)
        assert cache_key in self.diskCache
        del self.diskCache[cache_key]
        assert cache_key not in self.diskCache

        # DiskCached-decorated function
        total = self.add(12, 13)
        assert total == 25
        add_key = self.add.getKey(12, 13)
        assert add_key == ((12, 13), ())
        assert add_key in self.add
        del self.add[add_key]
        assert add_key not in self.add

    def testDirectCall(self):
        """Call the cache object directly through ``__call__``."""
        cached_str = DiskCache(get_cache_dir(4), fn=str)

        result = cached_str(7)
        assert result == "7"
        assert cached_str.getKey(7) in cached_str

    def testIterableCache(self):
        """Compare IterableCache results with freshly built sequences."""
        iterable_cache = IterableCache(get_cache_dir(5))

        def make_sequence(size):
            return range(size)

        for size in (4, 5, 6):
            cached_items = iterable_cache.fetch(make_sequence, size)

            # Pairwise comparison against a freshly computed sequence.
            for cached, expected in zip(cached_items, make_sequence(size)):
                assert cached == expected

    @pytest.mark.slow
    def testThreadSafety(self):
        """Run the workers ``f`` and ``g`` against one cache from a pool.

        ``f`` and ``g`` are defined outside this class; each of the 1000
        rounds maps both of them over 60 references to the same DiskCache
        using a pool of 12 workers.
        """
        for _ in range(1000):
            shared_cache = DiskCache(get_cache_dir(6))
            pool = Pool(12)

            pool.map(f, [shared_cache] * 60)
            pool.map(g, [shared_cache] * 60)

            pool.close()
            pool.join()