Example #1
# Stdlib imports needed by this example; Ontologies, OntologyTagger,
# Entity_Importer_List and the other helpers used below are project-local.
import os
import shutil
import tempfile
import uuid
from urllib.request import urlopen

import setup.views
import thesaurus.views

def write_named_entities_config():

    wordlist_configfilename = "/etc/opensemanticsearch/ocr/dictionary.txt"

    # build a unique temp filename without relying on the private
    # tempfile._get_candidate_names() API
    tmp_wordlist_configfilename = os.path.join(
        tempfile.gettempdir(), uuid.uuid4().hex + '_ocr_dictionary.txt')

    facets = []

    # create named entities configs for all ontologies
    for ontology in Ontologies.objects.all():

        print("Importing Ontology or List {} (ID: {})".format(
            ontology, ontology.id))

        # Download first, if the ontology is referenced by URI
        is_tempfile, filename = get_ontology_file(ontology)

        facet = get_facetname(ontology)

        # analyse content type & encoding
        contenttype, encoding = get_contenttype_and_encoding(filename)
        print("Detected content type: {}".format(contenttype))
        print("Detected encoding: {}".format(encoding))

        #
        # export entries to entities index
        #

        if contenttype == 'application/rdf+xml':

            #
            # write labels, words and synonyms config files
            #

            ontology_tagger = OntologyTagger()

            # load graph from RDF file
            ontology_tagger.parse(filename)

            # add the labels to entities index for normalization and entity linking
            ontology_tagger.solr_entities = 'http://localhost:8983/solr/'
            ontology_tagger.solr_core_entities = 'opensemanticsearch-entities'

            # append synonyms to Solr managed synonyms resource "skos"
            ontology_tagger.solr = 'http://localhost:8983/solr/'
            ontology_tagger.solr_core = 'opensemanticsearch'
            ontology_tagger.synonyms_resourceid = 'skos'

            # append single words of concept labels to wordlist for OCR word dictionary
            ontology_tagger.wordlist_configfile = tmp_wordlist_configfilename

            # additional all-labels fields for language-dependent analyzers/stemmers
            if ontology.stemming:
                for stemmer in ontology.stemming.split(','):
                    ontology_tagger.additional_all_labels_fields.append(
                        'all_labels_stemming_' + stemmer + '_ss')

            if ontology.stemming_force:
                for stemmer in ontology.stemming_force.split(','):
                    ontology_tagger.additional_all_labels_fields.append(
                        'all_labels_stemming_force_' + stemmer + '_ss')

            if ontology.stemming_hunspell:
                for stemmer in ontology.stemming_hunspell.split(','):
                    ontology_tagger.additional_all_labels_fields.append(
                        'all_labels_stemming_hunspell_' + stemmer + '_ss')

            if ontology.stemming_force_hunspell:
                for stemmer in ontology.stemming_force_hunspell.split(','):
                    ontology_tagger.additional_all_labels_fields.append(
                        'all_labels_stemming_force_hunspell_' + stemmer +
                        '_ss')

            # setup synonyms config and entities index
            ontology_tagger.apply(target_facet=facet)

        elif contenttype.startswith('text/plain'):
            dictionary2wordlist(
                sourcefilename=filename,
                encoding=encoding,
                wordlist_configfilename=tmp_wordlist_configfilename)
            importer = Entity_Importer_List()
            importer.import_entities(filename=filename,
                                     types=[facet],
                                     encoding=encoding)

        else:
            print("Unknown format {}".format(contenttype))

        # remember each new facet for which a list has been created, so we can
        # later write all these facets to the schema.xml config part
        if facet not in facets:
            facets.append(facet)

        # Delete the temp file if the ontology was downloaded from a URL
        if is_tempfile:
            os.remove(filename)

    # Write thesaurus entries to facet entities list(s) / dictionaries, entities index and synonyms
    thesaurus_facets = thesaurus.views.export_entities(
        wordlist_configfilename=tmp_wordlist_configfilename)

    # add facets used in thesaurus but not yet in an ontology to facet config
    for thesaurus_facet in thesaurus_facets:
        if thesaurus_facet not in facets:
            facets.append(thesaurus_facet)

    # Move temp OCR words config file to destination
    if os.path.isfile(tmp_wordlist_configfilename):
        shutil.move(tmp_wordlist_configfilename, wordlist_configfilename)

    # Create config for UI
    write_facet_config()

    # Create config for ETL / entity extraction
    setup.views.generate_etl_configfile()

    # Reload/restart Solr core with new synonyms config
    # Todo: Use the Solr URI from config (see the sketch after this example)
    urlopen(
        'http://localhost:8983/solr/admin/cores?action=RELOAD&core=opensemanticsearch'
    )
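The reload call above hard-codes the Solr base URL (note the Todo comment). A minimal sketch of reading it from a config file instead; the config path and INI layout here are assumptions for illustration, not the project's actual config format:

import configparser
from urllib.parse import urlencode
from urllib.request import urlopen


def reload_solr_core(core, configfile='/etc/opensemanticsearch/solr.ini'):
    # read the Solr base URL from an INI-style config file (hypothetical layout)
    config = configparser.ConfigParser()
    config.read(configfile)
    solr = config.get('solr', 'url', fallback='http://localhost:8983/solr/')
    # trigger a core reload so the changed synonyms/config take effect
    urlopen(solr + 'admin/cores?' + urlencode({'action': 'RELOAD', 'core': core}))


# usage: reload_solr_core('opensemanticsearch')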
Example #2

# Stdlib imports needed by this example; Dictionary_Manager, Ontologies,
# OntologyTagger, Entity_Importer_List, append_from_txtfile and the other
# helpers used below are project-local.
import os
import shutil
from urllib.request import urlopen

import thesaurus.views


def write_named_entities_config():

    dictionary_manager = Dictionary_Manager()

    wordlist_configfilename = "/etc/opensemanticsearch/ocr/dictionary.txt"

    tmp_wordlist_configfilename = os.path.join(
        dictionary_manager.solr_dictionary_config_path, 'tmp_ocr_dictionary.txt')

    facets = []

    # create named entities configs for all ontologies
    for ontology in Ontologies.objects.all():

        print("Importing Ontology or List {} (ID: {})".format(
            ontology, ontology.id))

        # Download first, if the ontology is referenced by URI
        is_tempfile, filename = get_ontology_file(ontology)

        facet = get_facetname(ontology)

        # analyse content type & encoding
        contenttype, encoding = get_contenttype_and_encoding(filename)
        print("Detected content type: {}".format(contenttype))
        print("Detected encoding: {}".format(encoding))

        # file to export all labels
        tmplistfilename = os.path.join(
            dictionary_manager.solr_dictionary_config_path, 'tmp_' + facet + '.txt')

        #
        # export entries to listfiles
        #

        if contenttype == 'application/rdf+xml':

            #
            # write labels, words and synonyms config files
            #

            ontology_tagger = OntologyTagger()

            # load graph from RDF file
            ontology_tagger.parse(filename)

            # add the labels to entities index for normalization and entity linking
            ontology_tagger.solr_entities = 'http://localhost:8983/solr/'
            ontology_tagger.solr_core_entities = 'opensemanticsearch-entities'

            # append synonyms to Solr managed synonyms resource "skos"
            ontology_tagger.solr = 'http://localhost:8983/solr/'
            ontology_tagger.solr_core = 'opensemanticsearch'
            ontology_tagger.synonyms_resourceid = 'skos'

            # append single words of concept labels to wordlist for OCR word dictionary
            ontology_tagger.wordlist_configfile = tmp_wordlist_configfilename

            # append all labels to the facets labels list
            ontology_tagger.labels_configfile = tmplistfilename

            # write synonyms config file
            ontology_tagger.apply(target_facet=facet)

        elif contenttype.startswith('text/plain'):
            append_from_txtfile(
                sourcefilename=filename,
                encoding=encoding,
                wordlist_configfilename=tmp_wordlist_configfilename)
            importer = Entity_Importer_List()
            importer.import_entities(filename=filename,
                                     types=[facet],
                                     dictionary=facet,
                                     facet_dictionary_is_tempfile=True,
                                     encoding=encoding)

        else:
            print("Unknown format {}".format(contenttype))

        # remember each new facet for which a list has been created, so we can
        # later write all these facets to the schema.xml config part
        if facet not in facets:
            facets.append(facet)

        # Delete the temp file if the ontology was downloaded from a URL
        if is_tempfile:
            os.remove(filename)

    # Write thesaurus entries to facet entities list(s) / dictionaries, entities index and synonyms
    thesaurus_facets = thesaurus.views.export_entities(
        wordlist_configfilename=tmp_wordlist_configfilename,
        facet_dictionary_is_tempfile=True)

    # add facets used in thesaurus but not yet in an ontology to facet config
    for thesaurus_facet in thesaurus_facets:
        if thesaurus_facet not in facets:
            facets.append(thesaurus_facet)

    # Move new and complete facet file to destination
    for facet in facets:

        tmplistfilename = os.path.join(
            dictionary_manager.solr_dictionary_config_path, 'tmp_' + facet + '.txt')
        listfilename = os.path.join(
            dictionary_manager.solr_dictionary_config_path, facet + '.txt')
        os.rename(tmplistfilename, listfilename)

    # Move temp synonyms and OCR words config file to destination
    # (shutil.move, unlike os.rename, also works across filesystems)
    if os.path.isfile(tmp_wordlist_configfilename):
        shutil.move(tmp_wordlist_configfilename, wordlist_configfilename)

    # Add facet dictionaries to Open Semantic Entity Search API config
    for facet in facets:

        dictionary_manager.create_dictionary(facet)

    # Create config for UI
    write_facet_config(automatch_facets=facets)

    # Reload/restart Solr cores / schema / config to apply the changed configs,
    # so that added config files / ontologies / facets / new dictionary entries
    # are considered when analyzing/indexing new documents
    # Todo: Use the Solr URI from config (see the sketch after Example #1)
    urlopen(
        'http://localhost:8983/solr/admin/cores?action=RELOAD&core=opensemanticsearch'
    )
    urlopen(
        'http://localhost:8983/solr/admin/cores?action=RELOAD&core=opensemanticsearch-entities'
    )
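Both examples rely on the project-local helper get_contenttype_and_encoding, which is not shown on this page. A minimal sketch of what such a helper might do, assuming the third-party python-magic package; an illustration, not the project's actual implementation:

import magic


def get_contenttype_and_encoding(filename):
    # detect the MIME type, e.g. 'application/rdf+xml' or 'text/plain'
    contenttype = magic.from_file(filename, mime=True)
    # detect the character encoding, e.g. 'utf-8' or 'iso-8859-1'
    encoding = magic.Magic(mime_encoding=True).from_file(filename)
    return contenttype, encoding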
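The text/plain branch in both examples appends dictionary words to the OCR wordlist via a project-local helper (dictionary2wordlist in Example #1, append_from_txtfile in Example #2) that is not shown here. A minimal sketch of the pattern, assuming one entry per line; the name and details are assumptions for illustration:

def append_from_txtfile(sourcefilename, encoding, wordlist_configfilename):
    # append each non-empty line of the source dictionary to the OCR wordlist
    with open(sourcefilename, encoding=encoding) as source, \
            open(wordlist_configfilename, 'a', encoding='utf-8') as wordlist:
        for line in source:
            word = line.strip()
            if word:
                wordlist.write(word + '\n')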