def tag_by_ontology(ontology):

    # get the ontology file
    is_tempfile, filename = get_ontology_file(ontology)

    facet = get_facetname(ontology)

    contenttype, encoding = get_contenttype_and_encoding(filename)

    if contenttype == 'application/rdf+xml':

        ontology_tagger = OntologyTagger()

        # load graph from RDF file
        ontology_tagger.parse(filename)

        # tag the documents on Solr server with all matching entities of the ontology
        ontology_tagger.tag = True
        ontology_tagger.apply(target_facet=facet)

    elif contenttype.startswith('text/plain'):

        tag_by_list(filename=filename, field=facet, encoding=encoding)

    else:
        print("Unknown format {}".format(contenttype))

    #
    # Delete tempfile if the ontology was downloaded by URL
    #
    if is_tempfile:
        os.remove(filename)
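# Hedged sketch (not the project's actual implementation): get_ontology_file()
# is expected to return (is_tempfile, filename), downloading the ontology to a
# tempfile when it is configured by URL, so callers such as tag_by_ontology()
# know whether to delete the file afterwards. The model fields ontology.uri and
# ontology.file are assumptions for illustration only.
def get_ontology_file_sketch(ontology):
    import urllib.request

    source = ontology.uri if ontology.uri else ontology.file.path

    if source.startswith(('http://', 'https://')):
        # download to a tempfile; the caller is responsible for deleting it
        filename, _headers = urllib.request.urlretrieve(source)
        return True, filename

    return False, source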
def tag_by_ontology(ontology):

    # get the ontology file
    is_tempfile, filename = get_ontology_file(ontology)

    facet = get_facetname(ontology)

    contenttype, encoding = get_contenttype_and_encoding(filename)

    queryfields = " ".join(get_stemmed_fields())

    if contenttype == 'application/rdf+xml':

        ontology_tagger = OntologyTagger()

        # load graph from RDF file
        ontology_tagger.parse(filename)

        # tag the documents on Solr server with all matching entities of the ontology
        ontology_tagger.tag = True
        ontology_tagger.apply(target_facet=facet, queryfields=queryfields)

    elif contenttype.startswith('text/plain'):

        tag_by_list(filename=filename, field=facet, encoding=encoding, queryfields=queryfields)

    else:
        print("Unknown format {}".format(contenttype))

    #
    # Delete tempfile if the ontology was downloaded by URL
    #
    if is_tempfile:
        os.remove(filename)
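# Usage sketch: tag the documents for every configured ontology, e.g. from a
# Django management command or a view. Ontologies is the Django model used
# throughout this module; the loop itself is illustrative.
for ontology in Ontologies.objects.all():
    tag_by_ontology(ontology)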
def write_named_entities_config():

    dictionary_manager = Dictionary_Manager()

    wordlist_configfilename = "/etc/opensemanticsearch/ocr/dictionary.txt"
    tmp_wordlist_configfilename = dictionary_manager.solr_dictionary_config_path + os.path.sep + 'tmp_ocr_dictionary.txt'

    facets = []

    # create named entities configs for all ontologies
    for ontology in Ontologies.objects.all():

        print("Importing Ontology or List {} (ID: {})".format(ontology, ontology.id))

        # Download, if URI
        is_tempfile, filename = get_ontology_file(ontology)

        facet = get_facetname(ontology)

        # analyse content type & encoding
        contenttype, encoding = get_contenttype_and_encoding(filename)
        print("Detected content type: {}".format(contenttype))
        print("Detected encoding: {}".format(encoding))

        # file to export all labels
        tmplistfilename = dictionary_manager.solr_dictionary_config_path + os.path.sep + 'tmp_' + facet + '.txt'

        #
        # export entries to listfiles
        #
        if contenttype == 'application/rdf+xml':

            #
            # write labels, words and synonyms config files
            #
            ontology_tagger = OntologyTagger()

            # load graph from RDF file
            ontology_tagger.parse(filename)

            # add the labels to entities index for normalization and entity linking
            ontology_tagger.solr_entities = 'http://localhost:8983/solr/'
            ontology_tagger.solr_core_entities = 'opensemanticsearch-entities'

            # append synonyms to Solr managed synonyms resource "skos"
            ontology_tagger.solr = 'http://localhost:8983/solr/'
            ontology_tagger.solr_core = 'opensemanticsearch'
            ontology_tagger.synonyms_resourceid = 'skos'

            # append single words of concept labels to wordlist for OCR word dictionary
            ontology_tagger.wordlist_configfile = tmp_wordlist_configfilename

            # append all labels to the facets labels list
            ontology_tagger.labels_configfile = tmplistfilename

            # write synonyms config file
            ontology_tagger.apply(target_facet=facet)

        elif contenttype.startswith('text/plain'):

            append_from_txtfile(sourcefilename=filename, encoding=encoding, wordlist_configfilename=tmp_wordlist_configfilename)

            importer = Entity_Importer_List()
            importer.import_entities(filename=filename, types=[facet], dictionary=facet, facet_dictionary_is_tempfile=True, encoding=encoding)

        else:
            print("Unknown format {}".format(contenttype))

        # remember each new facet for which a list has been created, so we can later write all these facets to the schema.xml config
        if facet not in facets:
            facets.append(facet)

        # Delete tempfile if the ontology was downloaded by URL
        if is_tempfile:
            os.remove(filename)

    # Write thesaurus entries to facet entities list(s) / dictionaries, entities index and synonyms
    thesaurus_facets = thesaurus.views.export_entities(wordlist_configfilename=tmp_wordlist_configfilename, facet_dictionary_is_tempfile=True)

    # add facets used in thesaurus but not yet in an ontology to facet config
    for thesaurus_facet in thesaurus_facets:
        if thesaurus_facet not in facets:
            facets.append(thesaurus_facet)

    # Move new and complete facet file to destination
    for facet in facets:
        tmplistfilename = dictionary_manager.solr_dictionary_config_path + os.path.sep + 'tmp_' + facet + '.txt'
        listfilename = dictionary_manager.solr_dictionary_config_path + os.path.sep + facet + '.txt'
        os.rename(tmplistfilename, listfilename)

    # Move temp synonyms and OCR words config file to destination
    if os.path.isfile(tmp_wordlist_configfilename):
        os.rename(tmp_wordlist_configfilename, wordlist_configfilename)

    # Add facet dictionaries to Open Semantic Entity Search API config
    for facet in facets:
        dictionary_manager.create_dictionary(facet)

    # Create config for UI
    write_facet_config(automatch_facets=facets)

    # Reload/restart Solr core / schema / config to apply changed configs,
    # so added config files / ontologies / facets / new dictionary entries
    # will be considered when analyzing/indexing new documents
    # Todo: Use the Solr URI from config
    urlopen('http://localhost:8983/solr/admin/cores?action=RELOAD&core=opensemanticsearch')
    urlopen('http://localhost:8983/solr/admin/cores?action=RELOAD&core=opensemanticsearch-entities')
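# Sketch for the "Todo: Use the Solr URI from config" above: read the Solr base
# URL from an environment variable (the OSS_SOLR_URL name is taken from this
# module) instead of hardcoding localhost; the loop itself is illustrative.
solr_url = os.getenv('OSS_SOLR_URL', default='http://localhost:8983/solr/')
for core in ('opensemanticsearch', 'opensemanticsearch-entities'):
    urlopen(solr_url + 'admin/cores?action=RELOAD&core=' + core)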
def write_named_entities_config(request):

    solr_config_path = "/var/solr/data/core1/conf/named_entities"
    wordlist_configfilename = "/etc/opensemanticsearch/ocr/dictionary.txt"

    facets = []

    synonyms_configfilename = solr_config_path + os.path.sep + 'synonyms.txt'
    tmp_synonyms_configfilename = solr_config_path + os.path.sep + 'tmp_synonyms.txt'
    tmp_wordlist_configfilename = solr_config_path + os.path.sep + 'tmp_ocr_dictionary.txt'

    # create an empty synonyms config file in case there are no synonyms in ontologies or thesaurus
    if_not_exist_create_empty_list(tmp_synonyms_configfilename)

    # create named entities configs for all ontologies
    for ontology in Ontologies.objects.all():

        try:
            print("Importing Ontology or List {} (ID: {})".format(ontology, ontology.id))

            # Download, if URI
            is_tempfile, filename = get_ontology_file(ontology)

            facet = get_facetname(ontology)

            # analyse content type & encoding
            contenttype, encoding = get_contenttype_and_encoding(filename)
            print("Detected content type: {}".format(contenttype))
            print("Detected encoding: {}".format(encoding))

            # file to export all labels
            tmplistfilename = solr_config_path + os.path.sep + 'tmp_' + facet + '.txt'

            #
            # export entries to listfiles
            #
            if contenttype == 'application/rdf+xml':

                #
                # write labels, words and synonyms config files
                #
                ontology_tagger = OntologyTagger()

                # load graph from RDF file
                ontology_tagger.parse(filename)

                # don't tag documents in the index; for now we only want to write the synonyms config
                ontology_tagger.solr = False

                # append synonyms to Solr config file
                ontology_tagger.synonyms_configfile = tmp_synonyms_configfilename

                # append single words of concept labels to wordlist for OCR word dictionary
                ontology_tagger.wordlist_configfile = tmp_wordlist_configfilename

                # append all labels to the facets labels list
                ontology_tagger.labels_configfile = tmplistfilename

                # write synonyms config file
                ontology_tagger.apply()

            elif contenttype.startswith('text/plain'):

                append_from_txtfile(sourcefilename=filename, targetfilename=tmplistfilename, encoding=encoding, wordlist_configfilename=tmp_wordlist_configfilename)

            else:
                # create an empty list so configs of a field in schema.xml or in the facet config of the UI pointing to this file will not break
                print("Unknown format {}".format(contenttype))
                if_not_exist_create_empty_list(targetfilename=tmplistfilename)

            # remember each new facet for which a list has been created, so we can later write all these facets to the schema.xml config
            if facet not in facets:
                facets.append(facet)

            # Delete tempfile if the ontology was downloaded by URL
            if is_tempfile:
                os.remove(filename)

        except BaseException as e:
            print("Error: Exception while importing ontology {}: {}".format(ontology, e))
            messages.add_message(request, messages.ERROR, "Error: Exception while importing ontology {}: {}".format(ontology, e))

    # Write thesaurus entries to facet entities list / dictionary
    thesaurus_facets = thesaurus.views.append_thesaurus_labels_to_dictionaries(synonyms_configfilename=tmp_synonyms_configfilename)

    # Append single words of concept labels to wordlist for OCR word dictionary
    thesaurus.views.append_concept_words_to_wordlist(wordlist_configfilename=tmp_wordlist_configfilename)

    # add facets used in thesaurus but not yet in an ontology to facet config
    for thesaurus_facet in thesaurus_facets:
        if thesaurus_facet not in facets:
            facets.append(thesaurus_facet)

    # Move new and complete facet file to destination
    for facet in facets:
        tmplistfilename = solr_config_path + os.path.sep + 'tmp_' + facet + '.txt'
        listfilename = solr_config_path + os.path.sep + facet + '.txt'
        os.rename(tmplistfilename, listfilename)

    # Move temp synonyms and OCR words config file to destination
    os.rename(tmp_synonyms_configfilename, synonyms_configfilename)
    os.rename(tmp_wordlist_configfilename, wordlist_configfilename)

    # Create config for schema.xml include for all facets
    configfilename = solr_config_path + os.path.sep + 'schema_named_entities.xml'
    write_solr_schema_config(configfilename, facets)

    # Create config for UI
    write_facet_config(automatch_facets=facets)

    # Reload/restart Solr core / schema / config to apply changed configs,
    # so added config files / ontologies / facets / new dictionary entries
    # will be considered when analyzing/indexing new documents
    solr_url = os.getenv('OSS_SOLR_URL', default='http://localhost:8983/solr/')
    urlopen(solr_url + 'admin/cores?action=RELOAD&core=core1')
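# Minimal sketch of the if_not_exist_create_empty_list() helper used above,
# assuming it only has to guarantee that the file exists so Solr configs and
# facet configs referencing it do not break. Not the project's actual code.
def if_not_exist_create_empty_list_sketch(targetfilename):
    if not os.path.isfile(targetfilename):
        # create an empty file
        open(targetfilename, 'w', encoding='utf-8').close()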
def write_named_entities_config():

    wordlist_configfilename = "/etc/opensemanticsearch/ocr/dictionary.txt"
    tmp_wordlist_configfilename = tempfile.gettempdir() + os.path.sep + next(tempfile._get_candidate_names()) + '_ocr_dictionary.txt'

    facets = []

    # create named entities configs for all ontologies
    for ontology in Ontologies.objects.all():

        print("Importing Ontology or List {} (ID: {})".format(ontology, ontology.id))

        # Download, if URI
        is_tempfile, filename = get_ontology_file(ontology)

        facet = get_facetname(ontology)

        # analyse content type & encoding
        contenttype, encoding = get_contenttype_and_encoding(filename)
        print("Detected content type: {}".format(contenttype))
        print("Detected encoding: {}".format(encoding))

        #
        # export entries to entities index
        #
        if contenttype == 'application/rdf+xml':

            #
            # write labels, words and synonyms config files
            #
            ontology_tagger = OntologyTagger()

            # load graph from RDF file
            ontology_tagger.parse(filename)

            # add the labels to entities index for normalization and entity linking
            ontology_tagger.solr_entities = 'http://localhost:8983/solr/'
            ontology_tagger.solr_core_entities = 'opensemanticsearch-entities'

            # append synonyms to Solr managed synonyms resource "skos"
            ontology_tagger.solr = 'http://localhost:8983/solr/'
            ontology_tagger.solr_core = 'opensemanticsearch'
            ontology_tagger.synonyms_resourceid = 'skos'

            # append single words of concept labels to wordlist for OCR word dictionary
            ontology_tagger.wordlist_configfile = tmp_wordlist_configfilename

            # additional all-labels fields for language dependent / additional analyzers/stemmers
            if ontology.stemming:
                for stemmer in ontology.stemming.split(','):
                    ontology_tagger.additional_all_labels_fields.append('all_labels_stemming_' + stemmer + '_ss')

            if ontology.stemming_force:
                for stemmer in ontology.stemming_force.split(','):
                    ontology_tagger.additional_all_labels_fields.append('all_labels_stemming_force_' + stemmer + '_ss')

            if ontology.stemming_hunspell:
                for stemmer in ontology.stemming_hunspell.split(','):
                    ontology_tagger.additional_all_labels_fields.append('all_labels_stemming_hunspell_' + stemmer + '_ss')

            if ontology.stemming_force_hunspell:
                for stemmer in ontology.stemming_force_hunspell.split(','):
                    ontology_tagger.additional_all_labels_fields.append('all_labels_stemming_force_hunspell_' + stemmer + '_ss')

            # setup synonyms config and entities index
            ontology_tagger.apply(target_facet=facet)

        elif contenttype.startswith('text/plain'):

            dictionary2wordlist(sourcefilename=filename, encoding=encoding, wordlist_configfilename=tmp_wordlist_configfilename)

            importer = Entity_Importer_List()
            importer.import_entities(filename=filename, types=[facet], encoding=encoding)

        else:
            print("Unknown format {}".format(contenttype))

        # remember each new facet for which a list has been created, so we can later write all these facets to the schema.xml config
        if facet not in facets:
            facets.append(facet)

        # Delete tempfile if the ontology was downloaded by URL
        if is_tempfile:
            os.remove(filename)

    # Write thesaurus entries to facet entities list(s) / dictionaries, entities index and synonyms
    thesaurus_facets = thesaurus.views.export_entities(wordlist_configfilename=tmp_wordlist_configfilename)

    # add facets used in thesaurus but not yet in an ontology to facet config
    for thesaurus_facet in thesaurus_facets:
        if thesaurus_facet not in facets:
            facets.append(thesaurus_facet)

    # Move temp OCR words config file to destination
    if os.path.isfile(tmp_wordlist_configfilename):
        shutil.move(tmp_wordlist_configfilename, wordlist_configfilename)

    # Create config for UI
    write_facet_config()

    # Create config for ETL / entity extraction
    setup.views.generate_etl_configfile()

    # Reload/restart Solr core with new synonyms config
    # Todo: Use the Solr URI from config
    urlopen('http://localhost:8983/solr/admin/cores?action=RELOAD&core=opensemanticsearch')
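# tempfile._get_candidate_names() above is a private CPython API. A hedged
# alternative sketch that keeps the same semantics (a path that does not exist
# yet, so the os.path.isfile() check above still means "something was written"):
# create a private temp directory and build the wordlist path inside it. The
# caller would additionally have to clean up the directory afterwards.
def make_tmp_wordlist_path():
    tmp_dir = tempfile.mkdtemp(prefix='opensemanticsearch_')
    return os.path.join(tmp_dir, 'ocr_dictionary.txt')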