def export_entity(concept,
                  wordlist_configfilename="/etc/opensemanticsearch/ocr/dictionary.txt",
                  appended_words=[],
                  facet_dictionary_is_tempfile=False,
                  commit=True):

    facet = "tag_ss"
    if concept.facet:
        facet = concept.facet.facet

    altLabels = []
    for alternate in Alternate.objects.filter(concept=concept.id):
        altLabels.append(alternate.altLabel)
    for hidden in Hidden.objects.filter(concept=concept.id):
        altLabels.append(hidden.hiddenLabel)

    entity_manager = Entity_Manager()

    uri = reverse('thesaurus:detail', args=[concept.pk])

    entity_manager.add(id=uri,
                       types=[facet],
                       preferred_label=concept.prefLabel,
                       prefLabels=[concept.prefLabel],
                       labels=altLabels,
                       dictionary=facet,
                       facet_dictionary_is_tempfile=facet_dictionary_is_tempfile)

    # Append single words of concept labels to wordlist of OCR word dictionary
    labels = [concept.prefLabel]
    labels.extend(altLabels)

    if wordlist_configfilename:
        wordlist_file = open(wordlist_configfilename, 'a', encoding="UTF-8")

        for label in labels:
            words = label.split()
            for word in words:
                word = word.strip("(),")
                if word:
                    if word not in appended_words:
                        appended_words.append(word)
                        appended_words.append(word.upper())
                        wordlist_file.write(word + "\n")
                        wordlist_file.write(word.upper() + "\n")

        wordlist_file.close()

    if commit:
        # reload changed dictionary matcher dictionaries in schema of entities index
        urlopen(os.getenv('ONTO_TAGGER_SOLR_ENTITIES_URL',
                          default='http://localhost:8983/solr/')
                + 'admin/cores?action=RELOAD&core='
                + os.getenv('ONTO_TAGGER_SOLR_CORE_ENTITIES',
                            default='opensemanticsearch-entities'))

    return appended_words
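A minimal usage sketch for the variant above: export several concepts while reusing one appended_words list and deferring the Solr core reload (commit=True) to the last call. The export_all_concepts helper and the concepts argument are assumptions for illustration, not part of the original source.

# Hypothetical batch export built on export_entity() above: reuse one
# appended_words list across calls and reload the entities core only once,
# by passing commit=True on the final concept.
def export_all_concepts(concepts):
    appended_words = []
    last = len(concepts) - 1
    for i, concept in enumerate(concepts):
        appended_words = export_entity(concept,
                                       appended_words=appended_words,
                                       commit=(i == last))
    return appended_words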
def export_entity(concept,
                  wordlist_configfilename="/etc/opensemanticsearch/ocr/dictionary.txt",
                  appended_words=[]):

    facet = "tag_ss"
    if concept.facet:
        facet = concept.facet.facet

    altLabels = []
    for alternate in Alternate.objects.filter(concept=concept.id):
        altLabels.append(alternate.altLabel)
    for hidden in Hidden.objects.filter(concept=concept.id):
        altLabels.append(hidden.hiddenLabel)

    entity_manager = Entity_Manager()

    fields = {}

    # taxonomy
    taxonomy = get_taxonomy(concept)
    if taxonomy:
        fields['skos_broader_taxonomy_prefLabel_ss'] = taxonomy

    entity_manager.add(id=concept.pk,
                       types=[facet],
                       preferred_label=concept.prefLabel,
                       prefLabels=[concept.prefLabel],
                       labels=altLabels,
                       fields=fields)

    # Append single words of concept labels to wordlist of OCR word dictionary
    labels = [concept.prefLabel]
    labels.extend(altLabels)

    if wordlist_configfilename:
        wordlist_file = open(wordlist_configfilename, 'a', encoding="UTF-8")

        for label in labels:
            words = label.split()
            for word in words:
                word = word.strip("(),")
                if word:
                    if word not in appended_words:
                        appended_words.append(word)
                        appended_words.append(word.upper())
                        wordlist_file.write(word + "\n")
                        wordlist_file.write(word.upper() + "\n")

        wordlist_file.close()

    return appended_words
def export_entity(concept,
                  wordlist_configfilename="/etc/opensemanticsearch/ocr/dictionary.txt",
                  appended_words=[]):

    facet = "tag_ss"
    if concept.facet:
        facet = concept.facet.facet

    altLabels = []
    for alternate in Alternate.objects.filter(concept=concept.id):
        altLabels.append(alternate.altLabel)
    for hidden in Hidden.objects.filter(concept=concept.id):
        altLabels.append(hidden.hiddenLabel)

    entity_manager = Entity_Manager()

    uri = reverse('thesaurus:detail', args=[concept.pk])

    entity_manager.add(id=uri,
                       types=[facet],
                       preferred_label=concept.prefLabel,
                       prefLabels=[concept.prefLabel],
                       labels=altLabels)

    # Append single words of concept labels to wordlist of OCR word dictionary
    labels = [concept.prefLabel]
    labels.extend(altLabels)

    if wordlist_configfilename:
        wordlist_file = open(wordlist_configfilename, 'a', encoding="UTF-8")

        for label in labels:
            words = label.split()
            for word in words:
                word = word.strip("(),")
                if word:
                    if word not in appended_words:
                        appended_words.append(word)
                        appended_words.append(word.upper())
                        wordlist_file.write(word + "\n")
                        wordlist_file.write(word.upper() + "\n")

        wordlist_file.close()

    return appended_words
def import_entities(self,
                    filename,
                    types=[],
                    dictionary=None,
                    facet_dictionary_is_tempfile=False,
                    encoding="utf-8"):

    entity_manager = Entity_Manager()
    entity_manager.verbose = self.verbose

    if not dictionary and types:
        dictionary = types[0]
    if not dictionary:
        dictionary = filename

    # open and read plaintext file line by line
    file = open(filename, encoding=encoding)
    for line in file:
        value = line.strip()
        if value:
            if self.verbose:
                print("Import entity {}".format(value))
            entity_manager.add(id=value,
                               types=types,
                               preferred_label=value,
                               prefLabels=[value],
                               dictionary=dictionary,
                               facet_dictionary_is_tempfile=facet_dictionary_is_tempfile)
    file.close()
def import_entities(self, filename, types=[], encoding="utf-8"):

    entity_manager = Entity_Manager()
    entity_manager.verbose = self.verbose

    # open and read plaintext file line by line
    file = open(filename, encoding=encoding)
    for line in file:
        value = line.strip()
        if value:
            if self.verbose:
                print("Import entity {}".format(value))
            entity_manager.add(id=value,
                               types=types,
                               preferred_label=value,
                               prefLabels=[value])
    file.close()
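Both import_entities variants are methods that only expect a verbose attribute on their owner. A minimal sketch of how one might be called; the EntityImporter class, the filename and the facet name are assumptions for illustration, not part of the original source.

# Hypothetical host object: it only supplies the `verbose` attribute that
# import_entities() reads before printing progress messages.
class EntityImporter:
    def __init__(self, verbose=False):
        self.verbose = verbose


importer = EntityImporter(verbose=True)
# call the function above with the importer as its `self`;
# locations.txt is assumed to contain one entity label per line
import_entities(importer, "locations.txt", types=["location_ss"])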
def test(self):

    # add test entity to entities index
    entity_manager = Entity_Manager()

    entity_manager.add(
        id="http://entity-unittest.local/entities/1",
        types=['entity-unittest_type_one', 'entity-unittest_type_two'],
        preferred_label="entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two",
        prefLabels=["entity-unittest_preferredLabels"],
        labels=[
            "entity-unittest_labels_one_part_one entity-unittest_labels_one_part_two",
            "entity-unittest_labels_two",
            "entity-unittest_labels_umlaut_äöüß"
        ])

    # extracts and normalizes/links all known entities/names/labels
    linker = Entity_Linker()

    # check if entity is found by preferred label
    results = linker.entities(
        text="I want to extract the id of entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two from a full text.")

    self.assertTrue(
        is_in_resultdata(
            resultdata=results,
            entity_id='http://entity-unittest.local/entities/1',
            fieldname='name',
            value='entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two'))

    # check that is_in_resultdata does not return True for a non-existent entity id
    self.assertFalse(
        is_in_resultdata(
            resultdata=results,
            entity_id='http://entity-unittest.local/entities/notexistententityid',
            fieldname='name',
            value='notexistant entity'))

    # check the types returned for the entity id
    self.assertTrue(
        is_in_resultdata(
            resultdata=results,
            entity_id='http://entity-unittest.local/entities/1',
            fieldname='type',
            value=['entity-unittest_type_one', 'entity-unittest_type_two']))

    # check if entity is found by another preferred label
    results = linker.entities(
        text="I want to extract the id of entity-unittest_preferredLabels from a full text.")

    self.assertTrue(
        is_in_resultdata(
            resultdata=results,
            entity_id='http://entity-unittest.local/entities/1',
            fieldname='name',
            value='entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two'))

    # check if entity is found by (alternate) labels
    results = linker.entities(
        text="I want to extract the id of entity-unittest_labels_one_part_one entity-unittest_labels_one_part_two from a full text.")

    self.assertTrue(
        is_in_resultdata(
            resultdata=results,
            entity_id='http://entity-unittest.local/entities/1',
            fieldname='name',
            value='entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two'))

    results = linker.entities(
        text="I want to extract the id of entity-unittest_labels_two from a full text.")

    self.assertTrue(
        is_in_resultdata(
            resultdata=results,
            entity_id='http://entity-unittest.local/entities/1',
            fieldname='name',
            value='entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two'))

    # check if entity is found by alternate label with special chars
    results = linker.entities(
        text="I want to extract the id of entity-unittest_labels_umlaut_äöüß from a full text.")

    self.assertTrue(
        is_in_resultdata(
            resultdata=results,
            entity_id='http://entity-unittest.local/entities/1',
            fieldname='name',
            value='entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two'))

    # entity should not be linked by only a part of the label
    results = linker.entities(
        text="I dont want to extract the id of entity-unittest_labels_one_part_one (missing second part of name) from a full text.")

    self.assertFalse(
        is_in_resultdata(
            resultdata=results,
            entity_id='http://entity-unittest.local/entities/1',
            fieldname='name',
            value='entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two'))
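The helper is_in_resultdata used throughout the test above is not shown in this excerpt. A minimal sketch, assuming the linker returns an iterable of per-entity dicts with at least 'id' plus the checked fields; the actual structure returned by Entity_Linker.entities() may differ.

# Sketch of the missing helper, under the assumption that `resultdata` looks
# like [{'id': ..., 'name': ..., 'type': [...]}, ...].
def is_in_resultdata(resultdata, entity_id, fieldname, value):
    for result in resultdata:
        if result.get('id') == entity_id and result.get(fieldname) == value:
            return True
    return False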