Code example #1
 def test_semantic_similarity_for_alignment(self):
     from oke.oak.util import extract_type_label
     from oke.oak.util import get_URI_fragmentIdentifier
     
     entityClasses_labels=["company"]
     all_rdf_types_labels=["Manufacturer", "Companies Listed On The Singapore Exchange"]
     most_similiar_dul_classes = self.semantic_similarity_for_alignment(entityClasses_labels, all_rdf_types_labels)
     print("most_semantic_similiar_dul_classes for 'company': ",most_similiar_dul_classes)
     
     entity_dbpedia_URI="http://dbpedia.org/resource/Brian_Banner"
     entityClasses_labels=["Fictional Villain","Villain"]
     linked_data_suggested_dulClasses, all_rdf_types =self.linked_data_discovery_for_alignment(entity_dbpedia_URI, entityClasses_labels)
     all_rdf_types_labels=set([extract_type_label(get_URI_fragmentIdentifier(rdftype_uri)) for rdftype_uri in all_rdf_types])
     most_similiar_dul_classes = self.semantic_similarity_for_alignment(entityClasses_labels, all_rdf_types_labels)
     print("most_semantic_similiar_dul_classes for 'Villain': ",most_similiar_dul_classes)
     
     entity_dbpedia_URI="http://dbpedia.org/resource/Dalavia_Far_East_Airways"
     entityClasses_labels=["Airline"]
     linked_data_suggested_dulClasses, all_rdf_types =self.linked_data_discovery_for_alignment(entity_dbpedia_URI, entityClasses_labels)
     all_rdf_types_labels=set([extract_type_label(get_URI_fragmentIdentifier(rdftype_uri)) for rdftype_uri in all_rdf_types])
     most_similiar_dul_classes = self.semantic_similarity_for_alignment(entityClasses_labels, all_rdf_types_labels)
     print("most_semantic_similiar_dul_classes for 'Airline': ",most_similiar_dul_classes)
      
     entity_dbpedia_URI="http://dbpedia.org/resource/Danderyds_sjukhus_metro_station"
     entityClasses_labels=["Station","Metro Station"]
     linked_data_suggested_dulClasses, all_rdf_types =self.linked_data_discovery_for_alignment(entity_dbpedia_URI, entityClasses_labels)
     all_rdf_types_labels=set([extract_type_label(get_URI_fragmentIdentifier(rdftype_uri)) for rdftype_uri in all_rdf_types])
     most_similiar_dul_classes = self.semantic_similarity_for_alignment(entityClasses_labels, all_rdf_types_labels)
     print("most_semantic_similiar_dul_classes for 'Metro Station': ",most_similiar_dul_classes)
Code example #2
    def test_semantic_similarity_for_alignment(self):
        from oke.oak.util import extract_type_label
        from oke.oak.util import get_URI_fragmentIdentifier

        entityClasses_labels = ["company"]
        all_rdf_types_labels = [
            "Manufacturer", "Companies Listed On The Singapore Exchange"
        ]
        most_similiar_dul_classes = self.semantic_similarity_for_alignment(
            entityClasses_labels, all_rdf_types_labels)
        print("most_semantic_similiar_dul_classes for 'company': ",
              most_similiar_dul_classes)

        entity_dbpedia_URI = "http://dbpedia.org/resource/Brian_Banner"
        entityClasses_labels = ["Fictional Villain", "Villain"]
        linked_data_suggested_dulClasses, all_rdf_types = self.linked_data_discovery_for_alignment(
            entity_dbpedia_URI, entityClasses_labels)
        all_rdf_types_labels = set([
            extract_type_label(get_URI_fragmentIdentifier(rdftype_uri))
            for rdftype_uri in all_rdf_types
        ])
        most_similiar_dul_classes = self.semantic_similarity_for_alignment(
            entityClasses_labels, all_rdf_types_labels)
        print("most_semantic_similiar_dul_classes for 'Villain': ",
              most_similiar_dul_classes)

        entity_dbpedia_URI = "http://dbpedia.org/resource/Dalavia_Far_East_Airways"
        entityClasses_labels = ["Airline"]
        linked_data_suggested_dulClasses, all_rdf_types = self.linked_data_discovery_for_alignment(
            entity_dbpedia_URI, entityClasses_labels)
        all_rdf_types_labels = set([
            extract_type_label(get_URI_fragmentIdentifier(rdftype_uri))
            for rdftype_uri in all_rdf_types
        ])
        most_similiar_dul_classes = self.semantic_similarity_for_alignment(
            entityClasses_labels, all_rdf_types_labels)
        print("most_semantic_similiar_dul_classes for 'Airline': ",
              most_similiar_dul_classes)

        entity_dbpedia_URI = "http://dbpedia.org/resource/Danderyds_sjukhus_metro_station"
        entityClasses_labels = ["Station", "Metro Station"]
        linked_data_suggested_dulClasses, all_rdf_types = self.linked_data_discovery_for_alignment(
            entity_dbpedia_URI, entityClasses_labels)
        all_rdf_types_labels = set([
            extract_type_label(get_URI_fragmentIdentifier(rdftype_uri))
            for rdftype_uri in all_rdf_types
        ])
        most_similiar_dul_classes = self.semantic_similarity_for_alignment(
            entityClasses_labels, all_rdf_types_labels)
        print("most_semantic_similiar_dul_classes for 'Metro Station': ",
              most_similiar_dul_classes)
Code example #3
    def ontology_alignment(self, context_data):
        '''
        Ontology alignment for a single context_data item:
        step 1: linked data discovery
        step 2: terminological similarity alignment
        step 3: semantic similarity alignment

        Returns a set of suggested aligned classes.
        '''
        from oke.oak.util import extract_type_label
        from oke.oak.util import get_URI_fragmentIdentifier

        entity_dbpedia_URI = context_data.entity.taIdentRef
        entityClasses = context_data.entity.isInstOfEntityClasses

        entityClasses_labels = set(
            [entityClass.anchorOf for entityClass in entityClasses])
        #step 1: linked data discovery for alignment suggestions
        linked_data_suggested_alignments, all_rdf_types = self.linked_data_discovery_for_alignment(
            entity_dbpedia_URI, entityClasses_labels)
        '''
        if (len(linked_data_suggested_alignments) > 0):
            return linked_data_suggested_alignments
        '''
        all_rdf_types_labels = set([
            extract_type_label(get_URI_fragmentIdentifier(rdftype_uri))
            for rdftype_uri in all_rdf_types
        ])

        #step 2: terminological similarity computation for alignment suggestions
        term_similarity_suggested_alignments = self.terminology_similarity_for_alignment(
            entityClasses_labels, all_rdf_types_labels)
        if len(term_similarity_suggested_alignments) > 0:
            return term_similarity_suggested_alignments

        #step 3: semantic similarity computation for alignment suggestion
        #semantic_similarity_suggestions=set()

        semantic_similarity_suggested_DUL_class = self.semantic_similarity_for_alignment(
            entityClasses_labels, all_rdf_types_labels)

        #semantic_similarity_suggestions.add(semantic_similarity_suggested_DUL_class)
        semantic_similarity_suggestions = {
            semantic_similarity_suggested_DUL_class
        }
        print("return suggested DUL alignment from semantic computation:",
              semantic_similarity_suggestions)

        return semantic_similarity_suggestions
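A hedged usage sketch for ontology_alignment, reusing the dataProcessor plumbing that batch_ontology_alignment relies on further down; Aligner is a placeholder for whatever class owns these methods:

# hypothetical driver: suggest DUL alignments for every gold-standard context
aligner = Aligner()  # placeholder for the class defining ontology_alignment()
contextDict = aligner.dataProcessor.get_task_context(
    aligner.dataProcessor.graphData_goldstandards)
for context, context_sent in contextDict.items():
    context_data = aligner.dataProcessor.aggregate_context_data(
        aligner.dataProcessor.graphData_goldstandards, context, context_sent)
    suggested_classes = aligner.ontology_alignment(context_data)
    print(context_data.entity.taIdentRef, '->', suggested_classes)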
Code example #4
 def ontology_alignment(self, context_data):
     '''
     Ontology alignment for a single context_data item:
     step 1: linked data discovery
     step 2: terminological similarity alignment
     step 3: semantic similarity alignment

     Returns a set of suggested aligned classes.
     '''
     from oke.oak.util import extract_type_label
     from oke.oak.util import get_URI_fragmentIdentifier
     
     entity_dbpedia_URI = context_data.entity.taIdentRef
     entityClasses = context_data.entity.isInstOfEntityClasses
     
     entityClasses_labels=set([entityClass.anchorOf for entityClass in entityClasses])
     #step 1: linked data discovery for alignment suggestions
     linked_data_suggested_alignments, all_rdf_types =self.linked_data_discovery_for_alignment(entity_dbpedia_URI,entityClasses_labels)
     '''
     if (len(linked_data_suggested_alignments) > 0):
         return linked_data_suggested_alignments
     '''
     all_rdf_types_labels=set([extract_type_label(get_URI_fragmentIdentifier(rdftype_uri)) for rdftype_uri in all_rdf_types])
     
     #step 2: terminological similarity computation for alignment suggestions
     term_similarity_suggested_alignments= self.terminology_similarity_for_alignment(entityClasses_labels, all_rdf_types_labels)
     if len(term_similarity_suggested_alignments) > 0:
         return term_similarity_suggested_alignments
     
     #step 3: semantic similarity computation for alignment suggestion
     #semantic_similarity_suggestions=set()
     
     semantic_similarity_suggested_DUL_class = self.semantic_similarity_for_alignment(entityClasses_labels,all_rdf_types_labels)
     
     #semantic_similarity_suggestions.add(semantic_similarity_suggested_DUL_class)
     semantic_similarity_suggestions={semantic_similarity_suggested_DUL_class}
     print("return suggested DUL alignment from semantic computation:",semantic_similarity_suggestions)
     
     return semantic_similarity_suggestions
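Step 2 of the pipeline (terminology_similarity_for_alignment) is not included in this listing. A minimal sketch of one plausible reading, suggesting the RDF type labels that match an entity class label after case-folding; the normalisation is an assumption, not the project's actual logic:

def terminology_similarity_for_alignment(entityClasses_labels, all_rdf_types_labels):
    # hypothetical: a type label is suggested when it equals an entity class
    # label, ignoring case and surrounding whitespace
    normalised = {label.strip().lower() for label in entityClasses_labels}
    return {rdf_label for rdf_label in all_rdf_types_labels
            if rdf_label.strip().lower() in normalised}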
Code example #5
    def batch_ontology_alignment(self):
        '''
        ontology alignment for DOLCE+DnS Ultra Lite classes
            : query for dbpedia rdf types -> wordnet path similarity (is-a taxonomy) matching
        '''
        from oke.oak.FeatureFactory import FeatureFactory
        from oke.oak.util import extract_type_label
        import collections

        featureFactory = FeatureFactory()

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        contextDict = self.dataProcessor.get_task_context(
            self.dataProcessor.graphData_goldstandards)
        entityset = set()
        dulclassset = set()
        without_duclass_num = 0

        true_positive = 0
        false_positive = 0
        true_negative = 0
        false_negative = 0

        for context, context_sent in contextDict.items():
            context_data = featureFactory.dataProcessor.aggregate_context_data(
                featureFactory.dataProcessor.graphData_goldstandards, context,
                context_sent)

            entity_dbpedia_URI = context_data.entity.taIdentRef
            entityClasses = context_data.entity.isInstOfEntityClasses

            labelled_class_type = [
                entityClass.subClassOf for entityClass in entityClasses
            ]
            print('labelled class type:', labelled_class_type)

            entity_class_labels = set(
                [entityClass.anchorOf for entityClass in entityClasses])

            entity_rdftypes = featureFactory.dbpedia_query_rdftypes(
                entity_dbpedia_URI)

            class_inst_rdftypes = featureFactory.dbpedia_query_deferencing_type(
                entity_class_labels)
            '''step 1: Linked Open Data Discovering: check if there is dul/d0 class already associated with entity and type (by dereferenceable URI)
            '''
            #http://www.ontologydesignpatterns.org/ont/d0.owl#Location
            entity_rdf_type_labels = set([
                extract_type_label(
                    featureFactory.get_URI_fragmentIdentifier(rdftype_uri))
                for rdftype_uri in entity_rdftypes
            ])
            #TODO: entity_class_rdf_type_labels

            # step 1: check whether there exist dul class already classified in DBpedia
            dulClass = [
                rdftype for rdftype in entity_rdftypes
                if self.is_dul_class(rdftype)
            ]

            entityset.add(context_data.entity.taIdentRef)
            testset = set()
            if len(dulClass) > 0 and dulClass[
                    0] in featureFactory.dul_ontology_classes.keys():
                dulclassset.add(dulClass[0])
                testset.add(dulClass[0])
            else:
                #'<',entity_dbpedia_URI,
                without_duclass_num += 1
                print(
                    str(without_duclass_num) +
                    '> does not have a DUL class pre-classified in DBpedia')

                entity_synset = set()
                entity_synset.update(entity_rdf_type_labels)
                entity_synset.update(entity_class_labels)

                aligned_type = self.schema_alignment_by_wordnet(
                    entity_synset, featureFactory.dul_ontology_classes)
                print("string similarity aligned type for [",
                      entity_class_labels, '] is [', aligned_type, ']')
                dulclassset.add(aligned_type)
                testset.add(aligned_type)

            print("labelled class type:", labelled_class_type)
            print("predicted class type:", testset)
            if (len(testset) > 0 and len(labelled_class_type) == 0):
                false_positive += 1
            elif (list(testset)[0] == list(labelled_class_type)[0]):
                true_positive += 1
            else:
                false_positive += 1

        print('precision:', true_positive / (true_positive + false_positive))
        print('entityset size:', len(entityset))
        print('existing dul class size:', len(dulclassset))
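The docstring above names WordNet path similarity over the is-a taxonomy as the matching strategy. A sketch of how schema_alignment_by_wordnet could be realised with NLTK, assuming dul_ontology_classes maps each DUL class URI to a label; this is an illustration of the idea, not the actual method:

from nltk.corpus import wordnet as wn

def schema_alignment_by_wordnet(entity_synset, dul_ontology_classes):
    # hypothetical: return the DUL class whose label is closest, by WordNet
    # path similarity, to any of the entity's type/class labels
    best_class, best_score = None, 0.0
    for dul_uri, dul_label in dul_ontology_classes.items():
        dul_synsets = wn.synsets(dul_label.replace(' ', '_'), pos=wn.NOUN)
        if not dul_synsets:
            continue
        for label in entity_synset:
            label_synsets = wn.synsets(label.split(' ')[-1], pos=wn.NOUN)
            if not label_synsets:
                continue
            score = dul_synsets[0].path_similarity(label_synsets[0]) or 0.0
            if score > best_score:
                best_class, best_score = dul_uri, score
    return best_class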
Code example #6
    def compute_features(self, context_data):
        '''
        A maximum entropy model gives better performance for sequence labelling problems.
        By maximising the entropy of our model we attempt to minimise the amount of information the model carries.
        We design a language model that maximises entropy, feed it a set of features associated
        with a given token we wish to classify, and the system then gives us the probability that
        the token falls into any given class of token against which the language model was trained.
        '''
        from oke.oak.util import wordnet_shortest_path
        from oke.oak.util import extract_type_label
        from oke.oak.util import get_URI_fragmentIdentifier
        from oke.oak.util import contains_digits
        
        #words, contextURI, previousLabel, position
        if type(context_data) is not TaskContext:
            raise Exception('Type error: context_data must be the instance of oke.oak.TaskContext')
        
        context_words=word_tokenize(context_data.isString)
        tagged_context=pos_tag(context_words)
        sem_tagged_context=self.sem_tag(context_words,context_data)        
        
        entity_name=context_data.entity.anchorOf
        entity_head_word=entity_name.split(' ')[-1:][0]
        entity_dbpedia_URI = context_data.entity.taIdentRef
        #print("entity_dbpedia_URI:"+entity_dbpedia_URI)
        '''
        LOD based semantic type feature:
        '''
        entity_rdftypes=self.entity_rdftypes_feature_extraction(entity_dbpedia_URI)         
        
        if (len (entity_rdftypes) == 0):
            print("Warn: No rdf types can be found for [current word")#entity_name.decode("utf8"),"]")
        # extract labels from RDF type
        entity_semantics=set()
        entity_semantics.update(set([extract_type_label(get_URI_fragmentIdentifier(rdftype_uri)) for rdftype_uri in entity_rdftypes]))
        
        #print('sem_tagged_context:',sem_tagged_context)
        #add head word into rdf type
        #  to avoid adding head word into rdf type: not many head word represent essential word associated with type
        #entity_semantics.add(entity_head_word)
        #print("entity_semantics:",entity_semantics)
        datums=[]
        
        #compute features for each word
        #use sliding window to observe on both left and right hand side
        currentIndex=0
        sliding_window_prev_n_words=8
        sliding_window_next_n_words=3
        
        for tagged_word in tagged_context:
            currentWord=tagged_word[0]
            #label encoding
            currentWord_label='O' if sem_tagged_context[currentIndex][1] !='class' else 'class'
            datum = Datum(context_data.contextURI,currentWord,currentWord_label)
            
            datum.previousLabel=datums[currentIndex-1].label if (currentIndex-1) in range(0,len(datums)) else 'None'
            
            features={}
            #word-level features (part-of-speech, case, punctuation,digit,morphology)
            import string
            if currentWord.lower() not in self.stoplist and currentWord not in string.punctuation and currentWord.isdigit() is not True and tagged_word[1] in ["NN", "NNP", "NNS"]:
                #use lemmatised word
                features["word"]= self.wordnet_lemmatizer.lemmatize(currentWord, pos='n')
                #Word sense of Noun: we can use "WN_CLASS" to determine whether the NN word is a hyponym of w (or keywords) in ontology by wordnet
                #features["WN_CLASS"]=
            features["word_pos"]=tagged_word[1]
            #features["word_root"]=self.wordnet_lemmatizer.lemmatize(currentWord, pos='n')
            features["is_title"]=str(currentWord).istitle()
            features['all_capital']=currentWord.isupper()
            features["is_word_root_be"]='Y' if self.wordnet_lemmatizer.lemmatize(currentWord, pos='v') == 'be' else 'N'
            features['is_punct_comma']='Y' if str(currentWord) == ',' else 'N'
            features['word_with_digits']='Y' if tagged_word[1]!='CD' and contains_digits(str(currentWord)) else 'N'         
            features["is_StopWord"]='Y' if currentWord in self.stoplist else 'N'
            features["is_Entity"]='N' if sem_tagged_context[currentIndex] !='entity' else 'Y'
            features["last_2_letters"]='None' if len(str(currentWord))<=2 or str(currentWord).isdigit() else str(currentWord)[-2:]
            #type_indicator can be retrieved by wordnet synonyms
            features["type_indicator"]='Y' if currentWord in ['name','form','type','class','category', 'variety', 'style','model','substance', 'version', 'genre','matter','mound', 'kind', 'shade', 'substance'] else 'N'
            
            #semantic (gazetteer lookup) features
            features["is_orgKey"] ='Y' if currentWord.lower() in self.gaz_org_key else 'N'
            features["is_locKey"] = 'Y' if currentWord.lower() in self.gaz_loc_key else 'N'
            features["is_country"] = 'Y' if currentWord.lower() in self.gaz_country else 'N'
            features["is_countryAdj"]='Y' if currentWord.lower in self.gaz_countryAdj else 'N'
            features["is_personName"] = 'Y' if currentWord.lower() in self.gaz_person_name else 'N'
            features["is_personTitle"] = 'Y' if currentWord.lower() in self.gaz_person_title else 'N'
            features['is_jobtitle']='Y' if currentWord.lower() in self.gaz_job_title else 'N'
            features['is_facKey']='Y' if currentWord.lower() in self.gaz_facility_key else 'N'
            
            #add feature to compute path similarity between dbpedia type and current word
            
            if entity_semantics:
                max_sim = max([wordnet_shortest_path(currentWord,sem_type.split(' ')[-1:][0]) for sem_type in entity_semantics])
                features['sim_dist_with_DbpediaType'] = max_sim
            
            
            for last_i in range(1,sliding_window_prev_n_words+1):
                if currentIndex == 0:
                    features['prev_word']="<START>"
                
                if currentIndex != 0 and currentIndex-last_i >=0:                    
                    #features['prev_'+str(last_i)+'_word']=datums[currentIndex-last_i].features['word'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_pos']=datums[currentIndex-last_i].features['word_pos'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    #features['prev_'+str(last_i)+'_word_root']=datums[currentIndex-last_i].features['word_root'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'                
                    features['prev_'+str(last_i)+'_word_is_StopWord']=datums[currentIndex-last_i].features['is_StopWord'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'         
                    features['prev_'+str(last_i)+'_word_is_Entity']=datums[currentIndex-last_i].features['is_Entity'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_is_title']=datums[currentIndex-last_i].features['is_title'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_all_capital']=datums[currentIndex-last_i].features['all_capital'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_is_word_root_be']=datums[currentIndex-last_i].features['is_word_root_be'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_is_punct_comma']=datums[currentIndex-last_i].features['is_punct_comma'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_word_with_digits']=datums[currentIndex-last_i].features['word_with_digits'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_last_2_letters']=datums[currentIndex-last_i].features['last_2_letters'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_type_indicator']=datums[currentIndex-last_i].features['type_indicator'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_is_orgKey']=datums[currentIndex-last_i].features['is_orgKey'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_is_locKey']=datums[currentIndex-last_i].features['is_locKey'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_is_country']=datums[currentIndex-last_i].features['is_country'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_is_countryAdj']=datums[currentIndex-last_i].features['is_countryAdj'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_is_personName']=datums[currentIndex-last_i].features['is_personName'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_is_personTitle']=datums[currentIndex-last_i].features['is_personTitle'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_'+str(last_i)+'_word_is_facKey']=datums[currentIndex-last_i].features['is_facKey'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                
            datum.features=features
            currentIndex+=1
            datums.append(datum)
        
        #add features about next words
        #reset to 0
        currentIndex = 0
        for tagged_word in tagged_context:
            for next_i in range(1, sliding_window_next_n_words+1):
                if ((currentIndex+next_i) == len(datums)):
                    datums[currentIndex].features['next_word']="<END>"
                
                if (currentIndex+next_i) != len(datums) :
                    #datums[currentIndex].features['next_'+str(next_i)+'_word']=datums[currentIndex+next_i].features['word'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_pos']=datums[currentIndex+next_i].features['word_pos'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                
                    datums[currentIndex].features['next_'+str(next_i)+'_word_is_StopWord']=datums[currentIndex+next_i].features['is_StopWord'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_is_Entity']=datums[currentIndex+next_i].features['is_Entity'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'

                    datums[currentIndex].features['next_'+str(next_i)+'_word_is_title']=datums[currentIndex+next_i].features['is_title'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_all_capital']=datums[currentIndex+next_i].features['all_capital'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_is_word_root_be']=datums[currentIndex+next_i].features['is_word_root_be'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_is_punct_comma']=datums[currentIndex+next_i].features['is_punct_comma'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_word_with_digits']=datums[currentIndex+next_i].features['word_with_digits'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_last_2_letters']=datums[currentIndex+next_i].features['last_2_letters'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_type_indicator']=datums[currentIndex+next_i].features['type_indicator'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_is_orgKey']=datums[currentIndex+next_i].features['is_orgKey'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_is_locKey']=datums[currentIndex+next_i].features['is_locKey'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_is_country']=datums[currentIndex+next_i].features['is_country'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_is_countryAdj']=datums[currentIndex+next_i].features['is_countryAdj'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_is_personName']=datums[currentIndex+next_i].features['is_personName'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_is_personTitle']=datums[currentIndex+next_i].features['is_personTitle'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features['next_'+str(next_i)+'_word_is_facKey']=datums[currentIndex+next_i].features['is_facKey'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
            currentIndex+=1
            
        return datums
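The feature dictionaries built above are intended for a maximum-entropy classifier. A brief sketch of feeding them to NLTK's MaxentClassifier; feature_factory and context_data are placeholders and the training parameters are illustrative:

from nltk.classify import MaxentClassifier

# hypothetical training run over the datums produced by compute_features()
datums = feature_factory.compute_features(context_data)  # placeholder owner of compute_features()
train_toks = [(datum.features, datum.label) for datum in datums]
classifier = MaxentClassifier.train(train_toks, algorithm='gis', max_iter=10)

# probability that the first token is labelled 'class' rather than 'O'
print(classifier.prob_classify(datums[0].features).prob('class'))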
Code example #7
 def batch_ontology_alignment(self):
     '''
     ontology alignment for DOLCE+DnS Ultra Lite classes
         : query for dbpedia rdf types -> wordnet path similarity (is-a taxonomy) matching
     '''
     from oke.oak.FeatureFactory import FeatureFactory
     from oke.oak.util import extract_type_label
     import collections
     
     featureFactory = FeatureFactory()
     
     refsets = collections.defaultdict(set)
     testsets = collections.defaultdict(set)
     
     contextDict = self.dataProcessor.get_task_context(self.dataProcessor.graphData_goldstandards)
     entityset=set()
     dulclassset=set()
     without_duclass_num=0
     
     true_positive=0
     false_positive=0
     true_negative=0
     false_negative=0
     
     for context, context_sent in contextDict.items():
         context_data=featureFactory.dataProcessor.aggregate_context_data(featureFactory.dataProcessor.graphData_goldstandards,context,context_sent)
         
         entity_dbpedia_URI = context_data.entity.taIdentRef
         entityClasses = context_data.entity.isInstOfEntityClasses
         
         labelled_class_type = [entityClass.subClassOf for entityClass in entityClasses]
         print('labelled class type:',labelled_class_type)
         
         entity_class_labels=set([entityClass.anchorOf for entityClass in entityClasses])
         
         entity_rdftypes = featureFactory.dbpedia_query_rdftypes(entity_dbpedia_URI)
         
         class_inst_rdftypes=featureFactory.dbpedia_query_deferencing_type(entity_class_labels)
         
         '''step 1: Linked Open Data Discovering: check if there is dul/d0 class already associated with entity and type (by dereferenceable URI)
         '''
         #http://www.ontologydesignpatterns.org/ont/d0.owl#Location
         entity_rdf_type_labels=set([extract_type_label(featureFactory.get_URI_fragmentIdentifier(rdftype_uri)) for rdftype_uri in entity_rdftypes])
         #TODO: entity_class_rdf_type_labels
         
         # step 1: check whether there exist dul class already classified in DBpedia
         dulClass=[rdftype for rdftype in entity_rdftypes if self.is_dul_class(rdftype)]
         
         entityset.add(context_data.entity.taIdentRef)
         testset=set()
         if len(dulClass) > 0 and dulClass[0] in featureFactory.dul_ontology_classes.keys():
             dulclassset.add(dulClass[0])
             testset.add(dulClass[0])
         else:
             #'<',entity_dbpedia_URI, 
             without_duclass_num+=1
             print(str(without_duclass_num)+'> does not have a DUL class pre-classified in DBpedia')
             
             entity_synset=set()
             entity_synset.update(entity_rdf_type_labels)
             entity_synset.update(entity_class_labels)
             
             aligned_type = self.schema_alignment_by_wordnet(entity_synset,featureFactory.dul_ontology_classes)
             print("string similarity aligned type for [",entity_class_labels,'] is [',aligned_type,']')
             dulclassset.add(aligned_type)
             testset.add(aligned_type)            
             
         print("labelled class type:",labelled_class_type)
         print("predicted class type:",testset)
         if (len(testset) > 0 and len(labelled_class_type) == 0):
             false_positive+=1
         elif (list(testset)[0] == list(labelled_class_type)[0]):
             true_positive+=1
         else:
             false_positive+=1
     
     print('precision:', true_positive/(true_positive+false_positive))
     print('entityset size:', len(entityset))
     print('existing dul class size:', len(dulclassset))                                
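Both batch versions rely on is_dul_class to spot RDF types that already belong to the DOLCE+DnS Ultralite (dul) or d0 namespaces. A one-line sketch of the assumed check, using the d0 namespace cited in the comment above plus the standard DUL.owl namespace:

DUL_NAMESPACES = (
    'http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#',
    'http://www.ontologydesignpatterns.org/ont/d0.owl#',
)

def is_dul_class(rdftype_uri):
    # hypothetical: a type counts as a DUL/d0 class when its URI starts
    # with one of the two ontology namespaces
    return rdftype_uri.startswith(DUL_NAMESPACES)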
Code example #8
    def compute_features(self, context_data):
        '''
        A maximum entropy model gives better performance for sequence labelling problems.
        By maximising the entropy of our model we attempt to minimise the amount of information the model carries.
        We design a language model that maximises entropy, feed it a set of features associated
        with a given token we wish to classify, and the system then gives us the probability that
        the token falls into any given class of token against which the language model was trained.
        '''
        from oke.oak.util import wordnet_shortest_path
        from oke.oak.util import extract_type_label
        from oke.oak.util import get_URI_fragmentIdentifier
        from oke.oak.util import contains_digits

        #words, contextURI, previousLabel, position
        if type(context_data) is not TaskContext:
            raise Exception(
                'Type error: context_data must be the instance of oke.oak.TaskContext'
            )

        context_words = word_tokenize(context_data.isString)
        tagged_context = pos_tag(context_words)
        sem_tagged_context = self.sem_tag(context_words, context_data)

        entity_name = context_data.entity.anchorOf
        entity_head_word = entity_name.split(' ')[-1:][0]
        entity_dbpedia_URI = context_data.entity.taIdentRef
        #print("entity_dbpedia_URI:"+entity_dbpedia_URI)
        '''
        LOD based semantic type feature:
        '''
        entity_rdftypes = self.entity_rdftypes_feature_extraction(
            entity_dbpedia_URI)

        if (len(entity_rdftypes) == 0):
            print("Warn: No rdf types can be found for [current word"
                  )  #entity_name.decode("utf8"),"]")
        # extract labels from RDF type
        entity_semantics = set()
        entity_semantics.update(
            set([
                extract_type_label(get_URI_fragmentIdentifier(rdftype_uri))
                for rdftype_uri in entity_rdftypes
            ]))

        #print('sem_tagged_context:',sem_tagged_context)
        #add head word into rdf type
        #  to avoid adding head word into rdf type: not many head word represent essential word associated with type
        #entity_semantics.add(entity_head_word)
        #print("entity_semantics:",entity_semantics)
        datums = []

        #compute features for each word
        #use sliding window to observe on both left and right hand side
        currentIndex = 0
        sliding_window_prev_n_words = 8
        sliding_window_next_n_words = 3

        for tagged_word in tagged_context:
            currentWord = tagged_word[0]
            #label encoding
            currentWord_label = 'O' if sem_tagged_context[currentIndex][
                1] != 'class' else 'class'
            datum = Datum(context_data.contextURI, currentWord,
                          currentWord_label)

            datum.previousLabel = datums[currentIndex - 1].label if (
                currentIndex - 1) in range(0, len(datums)) else 'None'

            features = {}
            #word-level features (part-of-speech, case, punctuation,digit,morphology)
            import string
            if currentWord.lower(
            ) not in self.stoplist and currentWord not in string.punctuation and currentWord.isdigit(
            ) is not True and tagged_word[1] in ["NN", "NNP", "NNS"]:
                #use lemmatised word
                features["word"] = self.wordnet_lemmatizer.lemmatize(
                    currentWord, pos='n')
                #Word sense of Noun: we can use "WN_CLASS" to determine whether the NN word is a hyponym of w (or keywords) in ontology by wordnet
                #features["WN_CLASS"]=
            features["word_pos"] = tagged_word[1]
            #features["word_root"]=self.wordnet_lemmatizer.lemmatize(currentWord, pos='n')
            features["is_title"] = str(currentWord).istitle()
            features['all_capital'] = currentWord.isupper()
            features[
                "is_word_root_be"] = 'Y' if self.wordnet_lemmatizer.lemmatize(
                    currentWord, pos='v') == 'be' else 'N'
            features['is_punct_comma'] = 'Y' if str(
                currentWord) == ',' else 'N'
            features['word_with_digits'] = 'Y' if tagged_word[
                1] != 'CD' and contains_digits(str(currentWord)) else 'N'
            features[
                "is_StopWord"] = 'Y' if currentWord in self.stoplist else 'N'
            features["is_Entity"] = 'N' if sem_tagged_context[
                currentIndex] != 'entity' else 'Y'
            features["last_2_letters"] = 'None' if len(
                str(currentWord)) <= 2 or str(currentWord).isdigit() else str(
                    currentWord)[-2:]
            #type_indicator can be retrieved by wordnet synonyms
            features["type_indicator"] = 'Y' if currentWord in [
                'name', 'form', 'type', 'class', 'category', 'variety',
                'style', 'model', 'substance', 'version', 'genre', 'matter',
                'mound', 'kind', 'shade', 'substance'
            ] else 'N'

            #semantic (gazetteer lookup) features
            features["is_orgKey"] = 'Y' if currentWord.lower(
            ) in self.gaz_org_key else 'N'
            features["is_locKey"] = 'Y' if currentWord.lower(
            ) in self.gaz_loc_key else 'N'
            features["is_country"] = 'Y' if currentWord.lower(
            ) in self.gaz_country else 'N'
            features[
                "is_countryAdj"] = 'Y' if currentWord.lower() in self.gaz_countryAdj else 'N'
            features["is_personName"] = 'Y' if currentWord.lower(
            ) in self.gaz_person_name else 'N'
            features["is_personTitle"] = 'Y' if currentWord.lower(
            ) in self.gaz_person_title else 'N'
            features['is_jobtitle'] = 'Y' if currentWord.lower(
            ) in self.gaz_job_title else 'N'
            features['is_facKey'] = 'Y' if currentWord.lower(
            ) in self.gaz_facility_key else 'N'

            #add feature to compute path similarity between dbpedia type and current word

            if entity_semantics:
                max_sim = max([
                    wordnet_shortest_path(currentWord,
                                          sem_type.split(' ')[-1:][0])
                    for sem_type in entity_semantics
                ])
                features['sim_dist_with_DbpediaType'] = max_sim

            for last_i in range(1, sliding_window_prev_n_words + 1):
                if currentIndex == 0:
                    features['prev_word'] = "<START>"

                if currentIndex != 0 and currentIndex - last_i >= 0:
                    #features['prev_'+str(last_i)+'_word']=datums[currentIndex-last_i].features['word'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_' + str(last_i) + '_word_pos'] = datums[
                        currentIndex - last_i].features['word_pos'] if (
                            currentIndex -
                            last_i) in range(0, len(datums)) else 'None'
                    #features['prev_'+str(last_i)+'_word_root']=datums[currentIndex-last_i].features['word_root'] if (currentIndex-last_i) in range(0,len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_StopWord'] = datums[
                                 currentIndex -
                                 last_i].features['is_StopWord'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_Entity'] = datums[
                                 currentIndex -
                                 last_i].features['is_Entity'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_title'] = datums[
                                 currentIndex -
                                 last_i].features['is_title'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_all_capital'] = datums[
                                 currentIndex -
                                 last_i].features['all_capital'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_word_root_be'] = datums[
                                 currentIndex -
                                 last_i].features['is_word_root_be'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_punct_comma'] = datums[
                                 currentIndex -
                                 last_i].features['is_punct_comma'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_word_with_digits'] = datums[
                                 currentIndex -
                                 last_i].features['word_with_digits'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_last_2_letters'] = datums[
                                 currentIndex -
                                 last_i].features['last_2_letters'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_type_indicator'] = datums[
                                 currentIndex -
                                 last_i].features['type_indicator'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_orgKey'] = datums[
                                 currentIndex -
                                 last_i].features['is_orgKey'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_locKey'] = datums[
                                 currentIndex -
                                 last_i].features['is_locKey'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_country'] = datums[
                                 currentIndex -
                                 last_i].features['is_country'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_countryAdj'] = datums[
                                 currentIndex -
                                 last_i].features['is_countryAdj'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_personName'] = datums[
                                 currentIndex -
                                 last_i].features['is_personName'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_personTitle'] = datums[
                                 currentIndex -
                                 last_i].features['is_personTitle'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'
                    features['prev_' + str(last_i) +
                             '_word_is_facKey'] = datums[
                                 currentIndex -
                                 last_i].features['is_facKey'] if (
                                     currentIndex - last_i) in range(
                                         0, len(datums)) else 'None'

            datum.features = features
            currentIndex += 1
            datums.append(datum)

        #add features about next words
        #reset to 0
        currentIndex = 0
        for tagged_word in tagged_context:
            for next_i in range(1, sliding_window_next_n_words + 1):
                if ((currentIndex + next_i) == len(datums)):
                    datums[currentIndex].features['next_word'] = "<END>"

                if (currentIndex + next_i) != len(datums):
                    #datums[currentIndex].features['next_'+str(next_i)+'_word']=datums[currentIndex+next_i].features['word'] if (currentIndex+next_i) in range(0,len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_pos'] = datums[currentIndex +
                                              next_i].features['word_pos'] if (
                                                  currentIndex +
                                                  next_i) in range(
                                                      0,
                                                      len(datums)) else 'None'

                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_is_StopWord'] = datums[
                            currentIndex + next_i].features['is_StopWord'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_is_Entity'] = datums[
                            currentIndex + next_i].features['is_Entity'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'

                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_is_title'] = datums[
                            currentIndex + next_i].features['is_title'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_all_capital'] = datums[
                            currentIndex + next_i].features['all_capital'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_is_word_root_be'] = datums[
                            currentIndex +
                            next_i].features['is_word_root_be'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_is_punct_comma'] = datums[
                            currentIndex +
                            next_i].features['is_punct_comma'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_word_with_digits'] = datums[
                            currentIndex +
                            next_i].features['word_with_digits'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_last_2_letters'] = datums[
                            currentIndex +
                            next_i].features['last_2_letters'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_type_indicator'] = datums[
                            currentIndex +
                            next_i].features['type_indicator'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_is_orgKey'] = datums[
                            currentIndex + next_i].features['is_orgKey'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_is_locKey'] = datums[
                            currentIndex + next_i].features['is_locKey'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_is_country'] = datums[
                            currentIndex + next_i].features['is_country'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_is_countryAdj'] = datums[
                            currentIndex +
                            next_i].features['is_countryAdj'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_is_personName'] = datums[
                            currentIndex +
                            next_i].features['is_personName'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) +
                        '_word_is_personTitle'] = datums[
                            currentIndex +
                            next_i].features['is_personTitle'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
                    datums[currentIndex].features[
                        'next_' + str(next_i) + '_word_is_facKey'] = datums[
                            currentIndex + next_i].features['is_facKey'] if (
                                currentIndex +
                                next_i) in range(0, len(datums)) else 'None'
            currentIndex += 1

        return datums