コード例 #1
0
def evalRelationTypes(fname, fGoldStd, methodLabel, method ):
    """ evaluates the relation types of the given ontology against a gold
        standard and returns the collected scores

    For every relation scoring method (EqualRel, EqualGroup, SimilarGroup)
    the disk-cached score and the F-measure derived from it are appended to
    the result list.

    @param[in] fname        file name of the ontology to evaluate
    @param[in] fGoldStd     file name of the gold standard ontology
    @param[in] methodLabel  label of the method used in the evaluation
                            (not read by this function)
    @param[in] method       method used in the evaluator
                            (not read by this function)
    @returns a list starting with the constant 1, followed by two entries
             (score, F-measure) per scoring method
    """
    goldStd  = _readOntology( fGoldStd )
    ontology = _readOntology( fname )

    goldStdRelations  = set(map(str, extractRelationSet(goldStd)))
    ontologyRelations = set(map(str, extractRelationSet(ontology)))

    log.info("Comparing the relation set %s to the gold standard %s." % (ontologyRelations, goldStdRelations))

    # neither the cache key nor the gold standard's base name depend on the
    # scoring method, so they are computed once
    key = "%s, %s |" % (ontologyRelations, goldStdRelations)
    goldStdName = os.path.basename(fGoldStd)
    ontologyCount = len(ontologyRelations)
    goldStdCount  = len(goldStdRelations)

    res = [ 1 ]
    for scoringMethod in (EqualRel, EqualGroup, SimilarGroup):
        # one cache directory per scoring method and gold standard
        cache = DiskCache(".diskCache-%s-%s" % (scoringMethod.__name__, goldStdName) )
        c = ConceptScoring(ontologyRelations, goldStdRelations, scoringMethod, '|')
        score = cache.fetchObjectId(key, c.score)
        res.append(score)

        # compute precision and recall; an empty relation set yields 0.
        # rather than raising a ZeroDivisionError
        p = float(score) / ontologyCount if ontologyCount else 0.
        r = float(score) / goldStdCount if goldStdCount else 0.
        if p == 0. and r == 0.:
            res.append(0.)
        else:
            res.append( metrics.fMeasure(p, r) )

    return res
コード例 #2
0
ファイル: cache_test.py プロジェクト: yaniamac/ewrt
    def testDirectCall(self):
        ''' verifies that a cache object wrapping a function can be invoked
            directly via __call__ and stores the result under its key '''
        cache_dir = get_cache_dir(4)
        str_cache = DiskCache(cache_dir, fn=str)

        # calling the cache object returns the wrapped function's result ...
        result = str_cache(7)
        assert result == "7"

        # ... and the key generated for the argument is afterwards contained
        # in the cache
        key = str_cache.getKey(7)
        assert key in str_cache
コード例 #3
0
ファイル: cache_test.py プロジェクト: weblyzard/ewrt
 def testDirectCall(self):
     ''' verifies that a cache object wrapping a function can be invoked
         directly via __call__ and stores the result under its key '''
     cache_dir = get_cache_dir(4)
     str_cache = DiskCache(cache_dir, fn=str)

     # calling the cache object returns the wrapped function's result ...
     result = str_cache(7)
     assert result == "7"

     # ... and the key generated for the argument is afterwards contained
     # in the cache
     key = str_cache.getKey(7)
     assert key in str_cache
コード例 #4
0
ファイル: __init__.py プロジェクト: weblyzard/ewrt
 def __init__(self, dataSource, cache=True):
     """ @param[in] dataSource implementing the TagInfoService Interface
         @param[in] cache      if true (default), tag counts are looked up
                               through a DiskCache located in
                               "./.coherence-tagcount-cache"; otherwise
                               getTagCount calls the data source directly
     """
     assert isinstance(dataSource, TagInfoService)
     self.dataSource = dataSource

     # truthiness test instead of `cache == True` (PEP 8)
     if not cache:
         self.getTagCount = self.dataSource.getTagInfo
         return

     diskCache = DiskCache("./.coherence-tagcount-cache", 2)

     def getCachedTagCount(tt):
         """ returns the tag info for tt, fetching it from the data source
             only if it is not available in the disk cache """
         # the data source's class name is part of the key; it is read at
         # call time, exactly like the data source itself
         key = self.dataSource.__class__.__name__ + str(tt)
         return diskCache.fetchObjectId(key, self.dataSource.getTagInfo, tt)

     self.getTagCount = getCachedTagCount
コード例 #5
0
ファイル: __init__.py プロジェクト: weblyzard/ewrt
 def __init__(self, dataSource, cache=True):
     """ @param[in] dataSource implementing the TagInfoService Interface
         @param[in] cache      if true (default), tag counts are looked up
                               through a DiskCache located in
                               "./.coherence-tagcount-cache"; otherwise
                               getTagCount calls the data source directly
     """
     assert isinstance(dataSource, TagInfoService)
     self.dataSource = dataSource

     # truthiness test instead of `cache == True` (PEP 8)
     if not cache:
         self.getTagCount = self.dataSource.getTagInfo
         return

     diskCache = DiskCache("./.coherence-tagcount-cache", 2)

     def getCachedTagCount(tt):
         """ returns the tag info for tt, fetching it from the data source
             only if it is not available in the disk cache """
         # the data source's class name is part of the key; it is read at
         # call time, exactly like the data source itself
         key = self.dataSource.__class__.__name__ + str(tt)
         return diskCache.fetchObjectId(key, self.dataSource.getTagInfo, tt)

     self.getTagCount = getCachedTagCount
コード例 #6
0
ファイル: cache_test.py プロジェクト: weblyzard/ewrt
 def testObjectKeyGeneration(self):
     ''' ensures that the location of a cached object on disk is derived
         from Cache.getObjectId and therefore does not change '''
     cache_dir = get_cache_dir(3)
     cache = DiskCache(cache_dir)

     def cache_location(key):
         ''' returns the path at which the object for key is expected '''
         return join(cache_dir, Cache.getObjectId(key))

     # explicit key: the object is stored under the id of that key
     cache.fetchObjectId(1, str, 1)
     assert exists(cache_location(1))

     # implicit key: fetch stores the object under the id of the
     # (positional arguments, keyword arguments) pair
     cache.fetch(str, 2)
     assert exists(cache_location(((2,), ())))
コード例 #7
0
ファイル: async.py プロジェクト: JakobSteixner/ewrt
 def getPostHashfile(self, cmd):
     ''' returns the file name for the given command, based on an object
         identifier that is compatible to the identifiers returned by the
         eWRT.util.cache.* classes. '''
     # everything after the first element of cmd forms the positional
     # arguments; pairing them with an empty tuple is required to produce
     # the same hash as DiskCache's fetch method
     positional = tuple(cmd[1:])
     cache_key = (positional, ())
     object_id = DiskCache.getObjectId(cache_key)
     return self._get_fname(object_id)
コード例 #8
0
class WebDocumentTerm(TermReference):
    """ @class WebDocumentTerm
        Similarity metric based on the similarity of the documents retrieved
        with a web search.
    """

    # search client shared by all instances
    yahoo = Yahoo()
    # caches the document vector computed for every concept
    __cache__ = DiskCache(".diskCache-WebDocumentTerm-conceptCache",
                          cache_nesting_level=2)

    @staticmethod
    def _getConceptWebDocuments(concept):
        """ returns web documents describing the given concept
            @param[in] concept concept used to describe the text; its name
                               and up to CONTEXT_TERM_COUNT of its context
                               terms are used as search terms
            @returns the cleaned up, newline-joined text of the documents
        """
        contextTerms = tuple(concept.context_terms)[:CONTEXT_TERM_COUNT]
        searchTerms = (concept.name, ) + contextTerms
        log.debug("Searching for %s" % str(searchTerms))

        query = WebDocumentTerm.yahoo.query(
            searchTerms,
            count=WEB_DOCUMENT_COUNT,
            queryParams={'view':'keyterms', 'abstract': 'long', 'type':'html,text'})
        yq = Yahoo.getSearchResults(query)

        # retrieve the documents in parallel; the pool is always shut down
        # afterwards so that its workers do not accumulate across calls
        p = Pool(WEB_DOCUMENT_COUNT)
        try:
            documents = p.map(p_getWebDocumentText, yq)
        finally:
            p.close()
            p.join()

        return cleanup("\n".join(documents))

    @staticmethod
    def _getConceptWebDocumentsVector(concept):
        """ returns a VectorSpaceModel built from the whitespace separated
            tokens of the web documents describing the given concept
            @param[in] concept concept to retrieve the documents for
        """
        tokens = WebDocumentTerm._getConceptWebDocuments(concept).split()
        return VectorSpaceModel(tokens)

    @staticmethod
    @DiskCached(".diskCache-WebDocumentTerm-or")
    def _or(c1, c2):
        """ Compares two concepts and returns their similarity score
            @param[in] c1 the first OntologyConcept
            @param[in] c2 the second OntologyConcept
            @returns the similarity between c1 and c2; 0. if no web
                     documents were found for at least one of them
        """
        c1Text = WebDocumentTerm.__cache__.fetchObjectId(
            c1, WebDocumentTerm._getConceptWebDocumentsVector, c1)
        c2Text = WebDocumentTerm.__cache__.fetchObjectId(
            c2, WebDocumentTerm._getConceptWebDocumentsVector, c2)

        # similarity for concepts with no matches
        c1Empty = len(c1Text.v) == 0
        c2Empty = len(c2Text.v) == 0
        if c1Empty:
            log.warn("No web pages found for '%s'" % c1)
        if c2Empty:
            log.warn("No web pages found for '%s'" % c2)
        if c1Empty or c2Empty:
            return 0.

        # the product of the two vector space models is the similarity score
        return c1Text * c2Text

    def __or__(self, o):
        """ returns the similarity between this term and the term o """
        return self._or(self.e, o.e)
コード例 #9
0
def evalOntology( fname, fGoldStd, methodLabel, method ):
    """ evaluates the concepts of the given ontology against a gold standard
        and returns the collected scores

    For every term scoring method the disk-cached score is printed and
    appended to the result list together with the F-measure derived from it.

    @param[in] fname        file name of the ontology to evaluate
    @param[in] fGoldStd     file name of the gold standard ontology
    @param[in] methodLabel  label of the method used in the evaluation
                            (not read by this function)
    @param[in] method       method used in the evaluator
                            (not read by this function)
    @returns a list starting with conceptTermCount(ontology), followed by
             two entries (score, F-measure) per scoring method
    """

    goldStd  = _readOntology( fGoldStd )
    ontology = _readOntology( fname )

    goldStdConcepts  = OntologyConcept.sequenceToOntologyConceptList(extractConceptSet(goldStd))
    ontologyConcepts = OntologyConcept.sequenceToOntologyConceptList(extractConceptSet(ontology))

    log.info("Comparing the ontology concepts %s to the gold standard %s." % (ontologyConcepts, goldStdConcepts))

    goldStdName   = os.path.basename(fGoldStd)
    ontologyCount = len(ontologyConcepts)
    goldStdCount  = len(goldStdConcepts)

    res = [ conceptTermCount( ontology ) ]
    for scoringMethod in (EqualTerm, StringEditTerm, PhoneticTerm, WordNetTerm, WikipediaTerm, WebDocumentTerm, GoogleDistanceTerm, OntologyTerm, ):
        # one cache directory per scoring method and gold standard
        cache = DiskCache(".diskCache-%s-%s" % (scoringMethod.__name__, goldStdName) )

        # Methods using neighbor concepts
        if scoringMethod in (WebDocumentTerm, ):
            goldNeighborConcepts = OntologyConcept.statementsToDirectNeighborOntologyConceptList( extractSPO(goldStd) )
            ontoNeighborConcepts = OntologyConcept.statementsToDirectNeighborOntologyConceptList( extractSPO(ontology) )
            c = ConceptScoring(ontoNeighborConcepts, goldNeighborConcepts, scoringMethod, '|', poolSize=1)
            key = "%s, %s |" % (ontoNeighborConcepts, goldNeighborConcepts)

        # methods using all concepts
        else:
            ps = 1 if scoringMethod == OntologyTerm else 4
            c = ConceptScoring(ontologyConcepts, goldStdConcepts, scoringMethod, '|', poolSize=ps)
            key = "%s, %s |" % (ontologyConcepts, goldStdConcepts)

        score = cache.fetchObjectId(key, c.score)
        # a single pre-formatted string prints identically with the print
        # statement and the print function
        print("%s %s" % (scoringMethod, score))
        res.append(score)

        # compute precision and recall; an empty concept list yields 0.
        # rather than raising a ZeroDivisionError
        # NOTE(review): the neighbor based score is also normalized by the
        # number of plain concepts - confirm that this is intended
        p = float(score) / ontologyCount if ontologyCount else 0.
        r = float(score) / goldStdCount if goldStdCount else 0.
        if p == 0. and r == 0.:
            res.append(0.)
        else:
            res.append( metrics.fMeasure(p, r) )

    return res
コード例 #10
0
ファイル: __init__.py プロジェクト: k3njiy/ewrt
 def __init__(self, submitter, api_key=OPENCALAIS_KEY, allow_distro="false", allow_search="false", cache_dir=OPENCALAIS_CACHE_DIR):
     """
     Creates a new handler for communicating with OpenCalais.

     @param[in] submitter    a string identifying your application.
     @param[in] api_key      a string with your OpenCalais API key (get it
                             here: http://developer.opencalais.com/apps/register).
     @param[in] allow_distro optional; if set to 'true' gives OpenCalais
                             permission to distribute the metadata extracted
                             from your submissions. The default is 'false'.
     @param[in] allow_search optional; if set to 'true' tells OpenCalais that
                             future searches can be performed on the
                             extracted metadata. The default is 'false'.
     @param[in] cache_dir    optional; directory of the disk cache. No cache
                             is used if the value is empty or None.
     """
     assert(api_key)
     self.submitter = submitter
     # honor the caller's choice instead of always storing "false"
     self.allow_distro = allow_distro
     self.allow_search = allow_search
     self.api_key = api_key
     # always define the attribute so that code testing `self.cache is None`
     # does not fail with an AttributeError when caching is disabled
     if cache_dir:
         self.cache = DiskCache(cache_dir, cache_nesting_level=2, cache_file_suffix=".xml")
     else:
         self.cache = None
コード例 #11
0
ファイル: __init__.py プロジェクト: k3njiy/ewrt
class Calais:
    """ Client for the OpenCalais web service: submits text for analysis and
        converts the returned XML into a list of dictionaries.
    """
    # class level defaults; __init__ sets the per-instance values
    submitter = USER_AGENT % "Calais"
    allow_distro = "false"
    allow_search = "false"
    api_key = ""

    def __init__(self, submitter, api_key=OPENCALAIS_KEY, allow_distro="false", allow_search="false", cache_dir=OPENCALAIS_CACHE_DIR):
        """
        Creates a new handler for communicating with OpenCalais.

        @param[in] submitter    a string identifying your application.
        @param[in] api_key      a string with your OpenCalais API key (get it
                                here: http://developer.opencalais.com/apps/register).
        @param[in] allow_distro optional; if set to 'true' gives OpenCalais
                                permission to distribute the metadata
                                extracted from your submissions. The default
                                is 'false'.
        @param[in] allow_search optional; if set to 'true' tells OpenCalais
                                that future searches can be performed on the
                                extracted metadata. The default is 'false'.
        @param[in] cache_dir    optional; directory of the disk cache. No
                                cache is used if the value is empty or None.
        """
        assert(api_key)
        self.submitter = submitter
        # honor the caller's choice instead of always storing "false"
        self.allow_distro = allow_distro
        self.allow_search = allow_search
        self.api_key = api_key
        # always define the attribute: analyze() tests `self.cache is None`
        if cache_dir:
            self.cache = DiskCache(cache_dir, cache_nesting_level=2, cache_file_suffix=".xml")
        else:
            self.cache = None

    @staticmethod
    def random_id(self=None):
        """
        Creates a random 10-character ID for your submission.

        @param[in] self unused; only kept (now optional) so that existing
                        calls passing one positional argument keep working
        @returns a string of 10 random ASCII letters and digits
        """
        import string
        chars = string.ascii_letters + string.digits
        return "".join( [ choice(chars) for _ in range(10) ] )


    @staticmethod
    def content_id(text):
        """
        Creates a SHA1 hash of the text of your submission.

        @param[in] text the submitted text; text that is not a byte string
                        is hashed in its UTF-8 encoding
        @returns the hexadecimal digest of the hash
        """
        try:
            import hashlib
            h = hashlib.sha1()
        except ImportError:
            import sha
            h = sha.new()

        # hash functions operate on bytes: encode unicode text explicitly
        # instead of relying on an implicit (ASCII only) conversion
        if not isinstance(text, bytes):
            text = text.encode("utf-8")

        h.update(text)
        return h.hexdigest()


    def analyze(self, text, content_type="text/txt"):
        """ Submits 'text' to OpenCalais for analysis and returns the
            extracted metadata.
            Set the content-type to 'text/html' if you are submitting HTML data.
            @param[in] text         the text to analyze
            @param[in] content_type content type of the submitted text
            @returns the parsed metadata (see parse)
        """
        externalID = self.content_id( text )
        paramsXML = PARAMS_XML % (content_type, self.allow_distro, self.allow_search, externalID, self.submitter)
        param = urlencode({'licenseID':self.api_key, 'content':text, 'paramsXML':paramsXML})

        get_calais_data = lambda x: Retrieve(Calais.__name__).open(OPENCALAIS_URL, x).read()

        # do not fetch the data again, if a file exists in the cache
        if self.cache is None:
            calais_data = get_calais_data( param )
        else:
            calais_data = self.cache.fetch( get_calais_data, param )

        return self.parse( self.unpack( calais_data ) )


    @staticmethod
    def unpack(calais_data):
        """ extracts calais' xml response from the data sent by the calais
            webservice
            @param[in] calais_data the xml document returned by the service
            @returns an xml declaration followed by the content of the
                     first "string" element of calais_data
        """
        dom = minidom.parseString(calais_data)
        return """<?xml version="1.0" encoding="utf-8"?>\n""" \
                 + dom.getElementsByTagName("string")[0].firstChild.data

    @staticmethod
    def cleanup_xml(xml_data):
        """ removes comments from xml-data-streams provided by opencalais
            @param[in] xml_data
            @returns the xml data without any comments
        """
        comment = re.compile(r'<!--[\s\S]*?-->')

        # removing a comment may expose a new one, therefore repeat until
        # nothing changes anymore; stopping on "no change" (rather than on
        # "no '<!--' left") prevents an endless loop for an unterminated
        # comment opener
        while True:
            cleaned = comment.sub('', xml_data)
            if cleaned == xml_data:
                return cleaned
            xml_data = cleaned


    @staticmethod
    def parse(xml_data):
        """ parses opencalais' xml output and returns its dictionary
            representation
            @param[in] xml_data the xml data returned by unpack
            @returns a list with one {nodeName: attributes} dictionary per
                     annotation; the attributes contain the node's
                     attributes and its text under the key 'data'
        """
        things = []

        xml_data = Calais.cleanup_xml(xml_data)
        dom = minidom.parseString( xml_data.encode("utf8" ))

        for document in dom.getElementsByTagName("CalaisSimpleOutputFormat"):
            for annotations in document.childNodes:
                # nodes without children carry no annotation
                if not annotations.hasChildNodes():
                    continue

                # the annotation of a 'Topics' node is its first child
                if annotations.nodeName == 'Topics':
                    annotations = annotations.firstChild

                nodeAttr = dict(annotations.attributes.items())
                nodeAttr['data'] = annotations.firstChild.data

                things.append( {annotations.nodeName: nodeAttr } )

        return things
コード例 #12
0
ファイル: async.py プロジェクト: weblyzard/ewrt
 def getPostHashfile(self, cmd):
     ''' returns the file name for the given command, based on an object
         identifier that is compatible to the identifiers returned by the
         eWRT.util.cache.* classes. '''
     # everything after the first element of cmd forms the positional
     # arguments; pairing them with an empty tuple is required to produce
     # the same hash as DiskCache's fetch method
     positional = tuple(cmd[1:])
     cache_key = (positional, ())
     object_id = DiskCache.getObjectId(cache_key)
     return self._get_fname(object_id)
コード例 #13
0
ファイル: cache_test.py プロジェクト: weblyzard/ewrt
 def setUp(self):
     ''' creates the disk cache stored in self.diskCache '''
     cache_dir = get_cache_dir(4)
     self.diskCache = DiskCache(cache_dir)
コード例 #14
0
ファイル: cache_test.py プロジェクト: weblyzard/ewrt
class SkipTestDiskCached(TestCached):
    ''' tests for the DiskCache and IterableCache classes and the DiskCached
        decorator '''

    @staticmethod
    @DiskCached(get_cache_dir(1))
    def add(a=1, b=2):
        ''' disk cached addition '''
        return a+b

    @staticmethod
    @DiskCached(get_cache_dir(2))
    def sub(a, b):
        ''' disk cached subtraction '''
        return a-b

    def setUp(self):
        ''' creates the disk cache stored in self.diskCache '''
        cache_dir = get_cache_dir(4)
        self.diskCache = DiskCache(cache_dir)

    def tearDown(self):
        ''' remove the cache directories '''
        for dir_no in range(10):
            if not exists(get_cache_dir(dir_no)):
                continue
            rmtree(get_cache_dir(dir_no))

    def testObjectKeyGeneration(self):
        ''' ensures that the location of a cached object on disk is derived
            from Cache.getObjectId and therefore does not change '''
        cache_dir = get_cache_dir(3)
        cache = DiskCache(cache_dir)

        def cache_location(key):
            ''' returns the path at which the object for key is expected '''
            return join(cache_dir, Cache.getObjectId(key))

        # explicit key
        cache.fetchObjectId(1, str, 1)
        assert exists(cache_location(1))

        # implicit key: the (positional, keyword) arguments pair
        cache.fetch(str, 2)
        assert exists(cache_location(((2,), ())))

    def testContains(self):
        ''' verifies that 'key' in cache works '''
        # diskcache
        fetched = self.diskCache.fetchObjectId(1, str, 1)
        assert fetched == "1"

        assert 1 in self.diskCache
        assert 2 not in self.diskCache

        # diskcached
        total = self.add(12, 14)
        assert total == 26
        add_key = self.add.getKey(12, 14)
        assert add_key in self.add
        assert 9 not in self.add

    def testDelItem(self):
        ''' verifies that delitem works '''
        # diskcache
        fetched = self.diskCache.fetch(str, 2)
        assert fetched == "2"
        cache_key = self.diskCache.getKey(2)
        assert cache_key in self.diskCache
        del self.diskCache[cache_key]
        assert cache_key not in self.diskCache

        # diskcached
        total = self.add(12, 13)
        assert total == 25
        add_key = self.add.getKey(12, 13)
        assert add_key == ((12, 13), ())
        assert add_key in self.add
        del self.add[add_key]
        assert add_key not in self.add

    def testDirectCall(self):
        ''' verifies that a cache object wrapping a function can be invoked
            directly via __call__ and stores the result under its key '''
        cache_dir = get_cache_dir(4)
        str_cache = DiskCache(cache_dir, fn=str)

        result = str_cache(7)
        assert result == "7"
        key = str_cache.getKey(7)
        assert key in str_cache

    def testIterableCache(self):
        ''' verifies that the items returned by the iterable cache equal
            the items of the original iterator '''
        cache_dir = get_cache_dir(5)
        iterable_cache = IterableCache(cache_dir)

        make_iterator = lambda size: range(size)

        for size in (4, 5, 6):
            cached = iterable_cache.fetch(make_iterator, size)
            expected = make_iterator(size)

            for cached_item, expected_item in zip(cached, expected):
                assert cached_item == expected_item

    @pytest.mark.slow
    def testThreadSafety(self):
        ''' runs the module level helpers f and g against a shared cache
            from a pool of 12 workers, 1000 times in a row '''
        for _ in range(1000):
            cache = DiskCache(get_cache_dir(6))
            pool = Pool(12)

            # every call of a round receives the same cache object
            pool.map(f, [cache] * 60)
            pool.map(g, [cache] * 60)

            pool.close()
            pool.join()
コード例 #15
0
 def __init__(self, e):
     """ @param[in] e the value passed on to TermReference and to every
                      metric class listed in self.METRICS

         Stores a dictionary in self.metrics that maps an instance of each
         metric class to a DiskCache named after that class.
     """
     TermReference.__init__(self, e)

     metricCaches = {}
     for metricClass in self.METRICS:
         # the metric is instantiated before its cache is created
         metric = metricClass(e)
         metricCaches[metric] = DiskCache(".diskCache-single-%s" % metricClass.__name__)

     self.metrics = metricCaches