Example #1
def write_named_entities_config():

    dictionary_manager = Dictionary_Manager()

    wordlist_configfilename = "/etc/opensemanticsearch/ocr/dictionary.txt"

    tmp_wordlist_configfilename = dictionary_manager.solr_dictionary_config_path + os.path.sep + 'tmp_ocr_dictionary.txt'

    facets = []

    # create named entities configs for all ontologies
    for ontology in Ontologies.objects.all():

        print("Importing Ontology or List {} (ID: {})".format(
            ontology, ontology.id))

        # Download, if URI
        is_tempfile, filename = get_ontology_file(ontology)

        facet = get_facetname(ontology)

        # analyse content type & encoding
        contenttype, encoding = get_contenttype_and_encoding(filename)
        print("Detected content type: {}".format(contenttype))
        print("Detected encoding: {}".format(encoding))

        # file to export all labels
        tmplistfilename = dictionary_manager.solr_dictionary_config_path + os.path.sep + 'tmp_' + facet + '.txt'

        #
        # export entries to listfiles
        #

        if contenttype == 'application/rdf+xml':

            #
            # write labels, words and synonyms config files
            #

            ontology_tagger = OntologyTagger()

            # load graph from RDF file
            ontology_tagger.parse(filename)

            # add the labels to the entities index for normalization and entity linking
            ontology_tagger.solr_entities = 'http://localhost:8983/solr/'
            ontology_tagger.solr_core_entities = 'opensemanticsearch-entities'

            # append synonyms to Solr managed synonyms resource "skos"
            ontology_tagger.solr = 'http://localhost:8983/solr/'
            ontology_tagger.solr_core = 'opensemanticsearch'
            ontology_tagger.synonyms_resourceid = 'skos'

            # append single words of concept labels to wordlist for OCR word dictionary
            ontology_tagger.wordlist_configfile = tmp_wordlist_configfilename

            # append all labels to the facet's labels list
            ontology_tagger.labels_configfile = tmplistfilename

            # write synonyms config file
            ontology_tagger.apply(target_facet=facet)

        elif contenttype.startswith('text/plain'):
            append_from_txtfile(
                sourcefilename=filename,
                encoding=encoding,
                wordlist_configfilename=tmp_wordlist_configfilename)
            importer = Entity_Importer_List()
            importer.import_entities(filename=filename,
                                     types=[facet],
                                     dictionary=facet,
                                     facet_dictionary_is_tempfile=True,
                                     encoding=encoding)

        else:
            print("Unknown format {}".format(contenttype))

        # remember each new facet for which a list has been created, so we can
        # later write all these facets to the schema.xml config part
        if facet not in facets:
            facets.append(facet)

        # Delete the temp file if the ontology was downloaded from a URL
        if is_tempfile:
            os.remove(filename)

    # Write thesaurus entries to facet entities list(s) / dictionaries, entities index and synonyms
    thesaurus_facets = thesaurus.views.export_entities(
        wordlist_configfilename=tmp_wordlist_configfilename,
        facet_dictionary_is_tempfile=True)

    # add facets used in thesaurus but not yet in an ontology to facet config
    for thesaurus_facet in thesaurus_facets:
        if thesaurus_facet not in facets:
            facets.append(thesaurus_facet)

    # Move new and complete facet file to destination
    for facet in facets:

        tmplistfilename = dictionary_manager.solr_dictionary_config_path + os.path.sep + 'tmp_' + facet + '.txt'
        listfilename = dictionary_manager.solr_dictionary_config_path + os.path.sep + facet + '.txt'
        os.rename(tmplistfilename, listfilename)

    # Move the temp OCR words config file to its destination
    if os.path.isfile(tmp_wordlist_configfilename):
        os.rename(tmp_wordlist_configfilename, wordlist_configfilename)

    # Add facet dictionaries to Open Semantic Entity Search API config
    for facet in facets:

        dictionary_manager.create_dictionary(facet)

    # Create config for UI
    write_facet_config(automatch_facets=facets)

    # Reload/restart Solr core / schema / config to apply the changed configs,
    # so added config files / ontologies / facets / new dictionary entries are
    # considered when analyzing/indexing new documents
    # Todo: Use the Solr URI from config
    urlopen(
        'http://localhost:8983/solr/admin/cores?action=RELOAD&core=opensemanticsearch'
    )
    urlopen(
        'http://localhost:8983/solr/admin/cores?action=RELOAD&core=opensemanticsearch-entities'
    )
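
The reload calls above hardcode the Solr URL despite the Todo. A minimal sketch of reading it from the environment instead, reusing the OSS_SOLR_URL variable that Example #3 reads; the helper name reload_solr_cores is an assumption, not part of the original code:

import os
from urllib.request import urlopen

def reload_solr_cores(cores=('opensemanticsearch', 'opensemanticsearch-entities')):
    # fall back to the hardcoded default used throughout these examples
    solr_url = os.getenv('OSS_SOLR_URL', default='http://localhost:8983/solr/')
    for core in cores:
        urlopen(solr_url + 'admin/cores?action=RELOAD&core=' + core)
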
Example #2
def write_named_entities_config():

    wordlist_configfilename = "/etc/opensemanticsearch/ocr/dictionary.txt"

    tmp_wordlist_configfilename = tempfile.gettempdir() + os.path.sep + next(
        tempfile._get_candidate_names()) + '_ocr_dictionary.txt'
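    # note: _get_candidate_names() is a private CPython helper and may change
    # between releases; a public-API alternative is sketched after this example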

    facets = []

    # create named entities configs for all ontologies
    for ontology in Ontologies.objects.all():

        print("Importing Ontology or List {} (ID: {})".format(
            ontology, ontology.id))

        # Download, if URI
        is_tempfile, filename = get_ontology_file(ontology)

        facet = get_facetname(ontology)

        # analyse content type & encoding
        contenttype, encoding = get_contenttype_and_encoding(filename)
        print("Detected content type: {}".format(contenttype))
        print("Detected encoding: {}".format(encoding))

        #
        # export entries to entities index
        #

        if contenttype == 'application/rdf+xml':

            #
            # write labels, words and synonyms config files
            #

            ontology_tagger = OntologyTagger()

            # load graph from RDF file
            ontology_tagger.parse(filename)

            # add the labels to the entities index for normalization and entity linking
            ontology_tagger.solr_entities = 'http://localhost:8983/solr/'
            ontology_tagger.solr_core_entities = 'opensemanticsearch-entities'

            # append synonyms to Solr managed synonyms resource "skos"
            ontology_tagger.solr = 'http://localhost:8983/solr/'
            ontology_tagger.solr_core = 'opensemanticsearch'
            ontology_tagger.synonyms_resourceid = 'skos'

            # append single words of concept labels to wordlist for OCR word dictionary
            ontology_tagger.wordlist_configfile = tmp_wordlist_configfilename

            # additional all-labels fields for language-dependent / additional analyzers/stemmers
            if ontology.stemming:
                for stemmer in ontology.stemming.split(','):
                    ontology_tagger.additional_all_labels_fields.append(
                        'all_labels_stemming_' + stemmer + '_ss')

            if ontology.stemming_force:
                for stemmer in ontology.stemming_force.split(','):
                    ontology_tagger.additional_all_labels_fields.append(
                        'all_labels_stemming_force_' + stemmer + '_ss')

            if ontology.stemming_hunspell:
                for stemmer in ontology.stemming_hunspell.split(','):
                    ontology_tagger.additional_all_labels_fields.append(
                        'all_labels_stemming_hunspell_' + stemmer + '_ss')

            if ontology.stemming_force_hunspell:
                for stemmer in ontology.stemming_force_hunspell.split(','):
                    ontology_tagger.additional_all_labels_fields.append(
                        'all_labels_stemming_force_hunspell_' + stemmer +
                        '_ss')

            # setup synonyms config and entities index
            ontology_tagger.apply(target_facet=facet)

        elif contenttype.startswith('text/plain'):
            dictionary2wordlist(
                sourcefilename=filename,
                encoding=encoding,
                wordlist_configfilename=tmp_wordlist_configfilename)
            importer = Entity_Importer_List()
            importer.import_entities(filename=filename,
                                     types=[facet],
                                     encoding=encoding)

        else:
            print("Unknown format {}".format(contenttype))

        # remember each new facet for which a list has been created, so we can
        # later write all these facets to the schema.xml config part
        if facet not in facets:
            facets.append(facet)

        # Delete the temp file if the ontology was downloaded from a URL
        if is_tempfile:
            os.remove(filename)

    # Write thesaurus entries to facet entities list(s) / dictionaries, entities index and synonyms
    thesaurus_facets = thesaurus.views.export_entities(
        wordlist_configfilename=tmp_wordlist_configfilename)

    # add facets used in thesaurus but not yet in an ontology to facet config
    for thesaurus_facet in thesaurus_facets:
        if thesaurus_facet not in facets:
            facets.append(thesaurus_facet)

    # Move temp OCR words config file to destination
    if os.path.isfile(tmp_wordlist_configfilename):
        shutil.move(tmp_wordlist_configfilename, wordlist_configfilename)

    # Create config for UI
    write_facet_config()

    # Create config for ETL / entity extraction
    setup.views.generate_etl_configfile()

    # Reload/restart Solr core with new synonyms config
    # Todo: Use the Solr URI from config
    urlopen(
        'http://localhost:8983/solr/admin/cores?action=RELOAD&core=opensemanticsearch'
    )
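
tempfile._get_candidate_names() used above is a private CPython helper. A sketch of building the temp wordlist path with the public API instead, with a behavioral caveat noted in the comments:

import os
import tempfile

# mkstemp() returns an open descriptor and a unique path; close the descriptor,
# since the code above opens the wordlist by name when appending to it
fd, tmp_wordlist_configfilename = tempfile.mkstemp(suffix='_ocr_dictionary.txt')
os.close(fd)
# caveat: unlike _get_candidate_names(), mkstemp() creates the file, so the
# os.path.isfile() guard above always moves it, even if nothing was written
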
Example #3
def write_named_entities_config(request):

    solr_config_path = "/var/solr/data/core1/conf/named_entities"
    wordlist_configfilename = "/etc/opensemanticsearch/ocr/dictionary.txt"

    facets = []

    synonyms_configfilename = solr_config_path + os.path.sep + 'synonyms.txt'

    tmp_synonyms_configfilename = solr_config_path + os.path.sep + 'tmp_synonyms.txt'
    tmp_wordlist_configfilename = solr_config_path + os.path.sep + 'tmp_ocr_dictionary.txt'

    # create an empty synonyms config file in case there are no synonyms in the
    # ontologies or thesaurus
    if_not_exist_create_empty_list(tmp_synonyms_configfilename)

    # create named entities configs for all ontologies
    for ontology in Ontologies.objects.all():

        try:
            print("Importing Ontology or List {} (ID: {})".format(
                ontology, ontology.id))

            # Download, if URI
            is_tempfile, filename = get_ontology_file(ontology)

            facet = get_facetname(ontology)

            # analyse content type & encoding
            contenttype, encoding = get_contenttype_and_encoding(filename)
            print("Detected content type: {}".format(contenttype))
            print("Detected encoding: {}".format(encoding))

            # file to export all labels
            tmplistfilename = solr_config_path + os.path.sep + 'tmp_' + facet + '.txt'

            #
            # export entries to listfiles
            #

            if contenttype == 'application/rdf+xml':

                #
                # write labels, words and synonyms config files
                #

                ontology_tagger = OntologyTagger()

                # load graph from RDF file
                ontology_tagger.parse(filename)

                # don't tag documents in the index; for now we only want to
                # write the synonyms config
                ontology_tagger.solr = False

                # append synonyms to Solr config file
                ontology_tagger.synonyms_configfile = tmp_synonyms_configfilename

                # append single words of concept labels to wordlist for OCR word dictionary
                ontology_tagger.wordlist_configfile = tmp_wordlist_configfilename

                # append all labels to the facet's labels list
                ontology_tagger.labels_configfile = tmplistfilename

                # write synonyms config file
                ontology_tagger.apply()

            elif contenttype.startswith('text/plain'):
                append_from_txtfile(
                    sourcefilename=filename,
                    targetfilename=tmplistfilename,
                    encoding=encoding,
                    wordlist_configfilename=tmp_wordlist_configfilename)

            else:
                # create an empty list so configs referencing this file (the
                # schema.xml field or the UI facet config) will not break
                print("Unknown format {}".format(contenttype))
                if_not_exist_create_empty_list(targetfilename=tmplistfilename)

            # remember each new facet for which a list has been created, so we
            # can later write all these facets to the schema.xml config part
            if facet not in facets:
                facets.append(facet)

            # Delete the temp file if the ontology was downloaded from a URL
            if is_tempfile:
                os.remove(filename)

        except Exception as e:  # don't swallow SystemExit/KeyboardInterrupt
            print("Error: Exception while importing ontology {}: {}".format(
                ontology, e))
            messages.add_message(
                request, messages.ERROR,
                "Error: Exception while importing ontology {}: {}".format(
                    ontology, e))

    # Write thesaurus entries to facet entities list / dictionary
    thesaurus_facets = thesaurus.views.append_thesaurus_labels_to_dictionaries(
        synonyms_configfilename=tmp_synonyms_configfilename)

    # Append single words of concept labels to wordlist for OCR word dictionary
    thesaurus.views.append_concept_words_to_wordlist(
        wordlist_configfilename=tmp_wordlist_configfilename)

    # add facets used in thesaurus but not yet in an ontology to facet config
    for thesaurus_facet in thesaurus_facets:
        if thesaurus_facet not in facets:
            facets.append(thesaurus_facet)

    # Move new and complete facet file to destination
    for facet in facets:

        tmplistfilename = solr_config_path + os.path.sep + 'tmp_' + facet + '.txt'
        listfilename = solr_config_path + os.path.sep + facet + '.txt'
        os.rename(tmplistfilename, listfilename)

    # Move temp synonyms and OCR words config file to destination
    os.rename(tmp_synonyms_configfilename, synonyms_configfilename)
    os.rename(tmp_wordlist_configfilename, wordlist_configfilename)

    # Create config for schema.xml include for all facets
    configfilename = solr_config_path + os.path.sep + 'schema_named_entities.xml'
    write_solr_schema_config(configfilename, facets)

    # Create config for UI
    write_facet_config(automatch_facets=facets)

    # Reload/restart Solr core / schema / config to apply the changed configs,
    # so added config files / ontologies / facets / new dictionary entries are
    # considered when analyzing/indexing new documents
    # Todo: Use the Solr URI from config
    solr_url = os.getenv('OSS_SOLR_URL', default='http://localhost:8983/solr/')
    urlopen(solr_url + 'admin/cores?action=RELOAD&core=core1')
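
The helper if_not_exist_create_empty_list() called in Example #3 is not shown on this page. Judging from its two call sites (one positional, one with targetfilename=), a plausible minimal implementation might be:

import os

def if_not_exist_create_empty_list(targetfilename):
    # create an empty list file so Solr schema fields and UI facet configs
    # referencing it do not break
    if not os.path.isfile(targetfilename):
        open(targetfilename, 'w', encoding='utf-8').close()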