    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        openrefine_server = False

        text = ''
        for field in data:

            values = data[field]

            if not isinstance(values, list):
                values = [values]

            for value in values:
                if value:
                    text = "{}{}\n".format(text, value)

        if openrefine_server:
            # use REST-API on (remote) HTTP server
            params = {'text': text}
            r = requests.post(openrefine_server, params=params)
            results = r.json()

        else:
            # use local Python library
            linker = Entity_Linker()
            linker.verbose = verbose

            results = linker.entities(text=text)

        if verbose:
            print("Named Entity Linking: {}".format(results))

        for match in results:
            for candidate in results[match]['result']:
                if candidate['match']:
                    for facet in candidate['type']:
                        etl.append(data, facet, candidate['name'])
                        etl.append(data, facet + '_uri_ss', candidate['id'])

        # mark the document as already analyzed by this plugin
        data['enhance_entity_linking_b'] = "true"

        return parameters, data
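A minimal, hypothetical usage sketch of the plugin above (the class name enhance_entity_linking, the field names, and the facet names in the comments are assumptions; Entity_Linker needs its backend available):

plugin = enhance_entity_linking()  # hypothetical plugin class name

parameters = {'verbose': True}
data = {
    'title_txt': 'Meeting notes',  # hypothetical field
    'content_txt': ['Alan Turing visited Bletchley Park.'],  # multi-valued fields get flattened
}

parameters, data = plugin.process(parameters=parameters, data=data)

# linked entities are appended as facets named after each candidate's
# 'type' entries, e.g. data['person_ss'] and data['person_ss_uri_ss']
assert data['enhance_entity_linking_b'] == "true"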
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        openrefine_server = False

        analyse_fields = [
            'title_txt', 'content_txt', 'description_txt', 'ocr_t',
            'ocr_descew_t'
        ]

        text = ''
        for field in analyse_fields:
            if field in data:
                text = "{}{}\n".format(text, data[field])

        if openrefine_server:
            # use REST-API on (remote) HTTP server
            params = {'text': text}
            r = requests.post(openrefine_server, params=params)
            results = r.json()

        else:
            # use local Python library
            linker = Entity_Linker()
            linker.verbose = verbose

            results = linker.entities(text=text)

        if verbose:
            print("Named Entity Linking: {}".format(results))

        for match in results:
            for candidate in results[match]['result']:

                for facet in candidate['type']:
                    etl.append(data, facet, candidate['name'])
                    etl.append(data, facet + '_uri_ss', candidate['id'])

        # mark the document as already analyzed by this plugin
        data['enhance_entity_linking_b'] = "true"

        return parameters, data
def reconcile(request):
	queries = None
	if 'queries' in request.GET:
		queries = json.loads(request.GET['queries'])
	elif 'queries' in request.POST:
		queries = json.loads(request.POST['queries'])
		
	text = None
	if 'text' in request.POST:
		text = request.POST['text']
	elif 'text' in request.GET:
		text = request.GET['text']

	if queries or text:

		# link/normalize/disambiguate entities
		entity_linker = Entity_Linker()
		results = entity_linker.entities(queries=queries, text=text)

	else:
	
		# no queries, so just return service metadata		
		results = {
			'name': 'Open Semantic Entity Search API',
		}
	
	# OpenRefine uses a JSONP callback
	callback = None
	if 'callback' in request.GET:
		callback = request.GET['callback']
	elif 'callback' in request.POST:
		callback = request.POST['callback']

	if callback:
		# JSONP response instead of JsonResponse
		results = '{}({});'.format(callback, json.dumps(results))
		return HttpResponse(results, "text/javascript")
	else:
		return JsonResponse(results)
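A sketch of a client call against this view, assuming it is routed at a /reconcile URL (the URL and the OpenRefine-style query shape are assumptions; the view itself only requires 'queries' to be a JSON string):

import json
import requests

url = 'http://localhost:8080/reconcile'  # hypothetical URL routing

# OpenRefine-style reconciliation queries (shape assumed)
queries = {'q0': {'query': 'Berlin'}}
r = requests.post(url, data={'queries': json.dumps(queries)})
print(r.json())

# with a callback parameter the body comes back as JSONP: cb({...});
r = requests.get(url, params={'text': 'Berlin', 'callback': 'cb'})
print(r.text)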
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        entity_linking_taggers = ['all_labels_ss_tag']
        if 'entity_linking_taggers' in parameters:
            entity_linking_taggers = parameters['entity_linking_taggers']

        # add taggers for stemming
        entity_linking_taggers_document_language_dependent = {}
        if 'entity_linking_taggers_document_language_dependent' in parameters:
            entity_linking_taggers_document_language_dependent = parameters[
                'entity_linking_taggers_document_language_dependent']

        if 'language_s' in data:
            # is there a language-specific tagger for the detected language?
            if data['language_s'] in entity_linking_taggers_document_language_dependent:
                for entity_linking_tagger in entity_linking_taggers_document_language_dependent[
                        data['language_s']]:
                    if entity_linking_tagger not in entity_linking_taggers:
                        entity_linking_taggers.append(entity_linking_tagger)

        openrefine_server = False
        if 'openrefine_server' in parameters:
            openrefine_server = parameters['openrefine_server']

        taxonomy_fields = ['skos_broader_taxonomy_prefLabel_ss']

        # collect/copy the text to be analyzed from all fields
        text = ''
        for field in data:

            values = data[field]

            if not isinstance(values, list):
                values = [values]

            for value in values:
                if value:
                    text = "{}{}\n".format(text, value)

        # tag all entities (by different taggers for different analyzers/stemmers)
        for entity_linking_tagger in entity_linking_taggers:

            # call REST API
            if openrefine_server:
                # use REST-API on (remote) HTTP server
                params = {'text': text}
                r = requests.post(openrefine_server, params=params)
                results = r.json()

            else:
                # use local Python library
                linker = Entity_Linker()
                linker.verbose = verbose

                results = linker.entities(
                    text=text,
                    taggers=[entity_linking_tagger],
                    additional_result_fields=taxonomy_fields)

            if verbose:
                print("Named Entity Linking by Tagger {}: {}".format(
                    entity_linking_tagger, results))

            # write entities from result to document facets
            for match in results:
                for candidate in results[match]['result']:
                    if candidate['match']:
                        for facet in candidate['type']:

                            # use a different facet for fuzzy/stemmed matches
                            if entity_linking_tagger != 'all_labels_ss_tag':
                                # do not use a separate facet if it is the same stemmer, only forced instead of document-language dependent
                                entity_linking_tagger_withoutforceoption = entity_linking_tagger.replace(
                                    '_stemming_force_', '_stemming_')
                                facet = facet + entity_linking_tagger_withoutforceoption + '_ss'

                            etl.append(data, facet, candidate['name'])
                            etl.append(data, facet + '_uri_ss',
                                       candidate['id'])
                            etl.append(
                                data, facet + '_preflabel_and_uri_ss',
                                candidate['name'] + ' <' + candidate['id'] +
                                '>')

                            if 'matchtext' in candidate:
                                for matchtext in candidate['matchtext']:
                                    etl.append(
                                        data, facet + '_matchtext_ss',
                                        candidate['id'] + "\t" + matchtext)

                            for taxonomy_field in taxonomy_fields:
                                if taxonomy_field in candidate:
                                    separated_taxonomy_fields = taxonomy2fields(
                                        field=facet,
                                        data=candidate[taxonomy_field])
                                    for separated_taxonomy_field in separated_taxonomy_fields:
                                        etl.append(
                                            data, separated_taxonomy_field,
                                            separated_taxonomy_fields[
                                                separated_taxonomy_field])

        # mark the document as already analyzed by this plugin
        data['etl_enhance_entity_linking_b'] = "true"

        return parameters, data
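The facet renaming for stemmed taggers above can be traced in isolation; the tagger name below is made up:

facet = 'person_ss'
entity_linking_tagger = 'label_de_stemming_force_ss_tag'  # hypothetical tagger name

if entity_linking_tagger != 'all_labels_ss_tag':
    # a forced stemmer writes to the same facet as the plain stemmer
    tagger_without_force = entity_linking_tagger.replace('_stemming_force_', '_stemming_')
    facet = facet + tagger_without_force + '_ss'

print(facet)  # person_sslabel_de_stemming_ss_tag_ss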
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        entity_linking_taggers = ['all_labels_ss_tag']
        if 'entity_linking_taggers' in parameters:
            entity_linking_taggers = parameters['entity_linking_taggers']

        # add taggers for stemming
        entity_linking_taggers_document_language_dependent = {}
        if 'entity_linking_taggers_document_language_dependent' in parameters:
            entity_linking_taggers_document_language_dependent = parameters[
                'entity_linking_taggers_document_language_dependent']

        if 'language_s' in data:
            # is there a language-specific tagger for the detected language?
            if data['language_s'] in entity_linking_taggers_document_language_dependent:
                for entity_linking_tagger in entity_linking_taggers_document_language_dependent[
                        data['language_s']]:
                    if entity_linking_tagger not in entity_linking_taggers:
                        entity_linking_taggers.append(entity_linking_tagger)

        openrefine_server = False
        if 'openrefine_server' in parameters:
            openrefine_server = parameters['openrefine_server']

        taxonomy_fields = ['skos_broader_taxonomy_prefLabel_ss']

        # collect/copy the text to be analyzed from all fields
        text = etl_plugin_core.get_text(data=data)

        # tag all entities (by different taggers for different analyzers/stemmers)
        for entity_linking_tagger in entity_linking_taggers:

            results = {}

            retries = 0
            retrytime = 1
            # wait time until the next retry is doubled, up to a maximum of 120 seconds (2 minutes)
            retrytime_max = 120
            no_connection = True

            while no_connection:
                try:
                    if retries > 0:
                        print(
                            'Retrying to connect to Solr tagger in {} second(s).'
                            .format(retrytime))
                        time.sleep(retrytime)
                        retrytime = retrytime * 2
                        if retrytime > retrytime_max:
                            retrytime = retrytime_max

                    # call REST API
                    if openrefine_server:
                        # use REST-API on (remote) HTTP server
                        params = {'text': text}
                        r = requests.post(openrefine_server, params=params)
                        # if bad status code, raise exception
                        r.raise_for_status()

                        results = r.json()

                    else:
                        # use local Python library
                        linker = Entity_Linker()
                        linker.verbose = verbose

                        results = linker.entities(
                            text=text,
                            taggers=[entity_linking_tagger],
                            additional_result_fields=taxonomy_fields)

                    no_connection = False

                except KeyboardInterrupt:
                    raise KeyboardInterrupt

                except requests.exceptions.ConnectionError as e:

                    retries += 1

                    if openrefine_server:
                        sys.stderr.write(
                            "Connection to Openrefine server failed (will retry in {} seconds). Exception: {}\n"
                            .format(retrytime, e))
                    else:
                        sys.stderr.write(
                            "Connection to Solr text tagger failed (will retry in {} seconds). Exception: {}\n"
                            .format(retrytime, e))

                except requests.exceptions.HTTPError as e:
                    if e.response.status_code == 503:

                        retries += 1

                        if openrefine_server:
                            sys.stderr.write(
                                "OpenRefine server temporarily unavailable (HTTP status code 503), will retry in {} seconds. Exception: {}\n"
                                .format(retrytime, e))
                        else:
                            sys.stderr.write(
                                "Solr temporarily unavailable (HTTP status code 503), will retry in {} seconds. Exception: {}\n"
                                .format(retrytime, e))

                    elif e.response.status_code == 400:
                        no_connection = False

                        # if the error is caused by an empty entity index for this tagger (no entities imported yet), do not log an error or mark the document as failed
                        empty_entity_index = False
                        try:
                            errorstatus = e.response.json()
                            if errorstatus['error'][
                                    'msg'] == 'field ' + entity_linking_tagger + ' has no indexed data':
                                empty_entity_index = True
                        except Exception:
                            pass

                        if not empty_entity_index:
                            etl.error_message(docid=parameters['id'],
                                              data=data,
                                              plugin='enhance_entity_linking',
                                              e=e)

                    else:
                        no_connection = False
                        etl.error_message(docid=parameters['id'],
                                          data=data,
                                          plugin='enhance_entity_linking',
                                          e=e)

                except BaseException as e:
                    no_connection = False
                    etl.error_message(docid=parameters['id'],
                                      data=data,
                                      plugin='enhance_entity_linking',
                                      e=e)

            if verbose:
                print("Named Entity Linking by Tagger {}: {}".format(
                    entity_linking_tagger, results))

            # write entities from result to document facets
            for match in results:
                for candidate in results[match]['result']:
                    if candidate['match']:
                        for facet in candidate['type']:

                            # use a different facet for fuzzy/stemmed matches
                            if entity_linking_tagger != 'all_labels_ss_tag':
                                # do not use a separate facet if it is the same stemmer, only forced instead of document-language dependent
                                entity_linking_tagger_withoutforceoption = entity_linking_tagger.replace(
                                    '_stemming_force_', '_stemming_')
                                facet = facet + entity_linking_tagger_withoutforceoption + '_ss'

                            etl_plugin_core.append(data, facet,
                                                   candidate['name'])
                            etl_plugin_core.append(data, facet + '_uri_ss',
                                                   candidate['id'])
                            etl_plugin_core.append(
                                data, facet + '_preflabel_and_uri_ss',
                                candidate['name'] + ' <' + candidate['id'] +
                                '>')

                            if 'matchtext' in candidate:
                                for matchtext in candidate['matchtext']:
                                    etl_plugin_core.append(
                                        data, facet + '_matchtext_ss',
                                        candidate['id'] + "\t" + matchtext)

                            for taxonomy_field in taxonomy_fields:
                                if taxonomy_field in candidate:
                                    separated_taxonomy_fields = taxonomy2fields(
                                        taxonomy=candidate[taxonomy_field],
                                        field=facet)
                                    for separated_taxonomy_field in separated_taxonomy_fields:
                                        etl_plugin_core.append(
                                            data, separated_taxonomy_field,
                                            separated_taxonomy_fields[
                                                separated_taxonomy_field])

        return parameters, data
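The retry loop above doubles the wait time per attempt and caps it at 120 seconds; a standalone sketch of that schedule:

retrytime, retrytime_max = 1, 120
schedule = []
for _ in range(10):
    schedule.append(retrytime)
    retrytime = min(retrytime * 2, retrytime_max)
print(schedule)  # [1, 2, 4, 8, 16, 32, 64, 120, 120, 120]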
    def test(self):

        # add test entity to entities index
        entity_manager = Entity_Manager()

        entity_manager.add(
            id="http://entity-unittest.local/entities/1",
            types=['entity-unittest_type_one', 'entity-unittest_type_two'],
            preferred_label=
            "entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two",
            prefLabels=["entity-unittest_preferredLabels"],
            labels=[
                "entity-unittest_labels_one_part_one entity-unittest_labels_one_part_two",
                "entity-unittest_labels_two",
                "entity-unittest_labels_umlaut_äöüß"
            ])

        # extracts and normalizes/links all known entities/names/labels
        linker = Entity_Linker()

        # check if entity is found by preferred label
        results = linker.entities(
            text=
            "I want to extract the id of entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two from a full text."
        )

        self.assertTrue(
            is_in_resultdata(
                resultdata=results,
                entity_id='http://entity-unittest.local/entities/1',
                fieldname='name',
                value=
                'entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two'
            ))

        # check that is_in_resultdata does not return True for a nonexistent id
        self.assertFalse(
            is_in_resultdata(
                resultdata=results,
                entity_id=
                'http://entity-unittest.local/entities/notexistententityid',
                fieldname='name',
                value='nonexistent entity'))

        # check returned types of returned entity id
        self.assertTrue(
            is_in_resultdata(
                resultdata=results,
                entity_id='http://entity-unittest.local/entities/1',
                fieldname='type',
                value=['entity-unittest_type_one',
                       'entity-unittest_type_two']))

        # check if entity is found by another preferred label
        results = linker.entities(
            text=
            "I want to extract the id of entity-unittest_preferredLabels from a full text."
        )
        self.assertTrue(
            is_in_resultdata(
                resultdata=results,
                entity_id='http://entity-unittest.local/entities/1',
                fieldname='name',
                value=
                'entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two'
            ))

        # check if entity is found by (alternate) labels
        results = linker.entities(
            text=
            "I want to extract the id of entity-unittest_labels_one_part_one entity-unittest_labels_one_part_two from a full text."
        )
        self.assertTrue(
            is_in_resultdata(
                resultdata=results,
                entity_id='http://entity-unittest.local/entities/1',
                fieldname='name',
                value=
                'entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two'
            ))

        results = linker.entities(
            text=
            "I want to extract the id of entity-unittest_labels_two from a full text."
        )
        self.assertTrue(
            is_in_resultdata(
                resultdata=results,
                entity_id='http://entity-unittest.local/entities/1',
                fieldname='name',
                value=
                'entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two'
            ))

        # check if entity is found by alternate label with special chars
        results = linker.entities(
            text=
            "I want to extract the id of entity-unittest_labels_umlaut_äöüß from a full text."
        )
        self.assertTrue(
            is_in_resultdata(
                resultdata=results,
                entity_id='http://entity-unittest.local/entities/1',
                fieldname='name',
                value=
                'entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two'
            ))

        # entity should not be linked by only a part of the label
        results = linker.entities(
            text=
            "I dont want to extract the id of entity-unittest_labels_one_part_one (missing second part of name) from a full text."
        )
        self.assertFalse(
            is_in_resultdata(
                resultdata=results,
                entity_id='http://entity-unittest.local/entities/1',
                fieldname='name',
                value=
                'entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two'
            ))
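is_in_resultdata itself is not shown here; a plausible sketch consistent with the result structure used above (a dict of matches, each with a 'result' list of candidate dicts) might look like this, though the real helper may differ:

def is_in_resultdata(resultdata=None, entity_id=None, fieldname=None, value=None):
    # hypothetical reimplementation, inferred from the result structure above
    for match in resultdata:
        for candidate in resultdata[match]['result']:
            if candidate['id'] == entity_id and candidate.get(fieldname) == value:
                return True
    return False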
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        entity_linking_taggers = ['all_labels_ss_tag']
        if 'entity_linking_taggers' in parameters:
            entity_linking_taggers = parameters['entity_linking_taggers']

        entity_linking_taggers_document_language_dependent = {}
        if 'entity_linking_taggers_document_language_dependent' in parameters:
            entity_linking_taggers_document_language_dependent = parameters[
                'entity_linking_taggers_document_language_dependent']

        if 'language_s' in data:
            # is there a language-specific tagger for the detected language?
            if data['language_s'] in entity_linking_taggers_document_language_dependent:
                for entity_linking_tagger in entity_linking_taggers_document_language_dependent[
                        data['language_s']]:
                    if entity_linking_tagger not in entity_linking_taggers:
                        entity_linking_taggers.append(entity_linking_tagger)

        openrefine_server = False
        if 'openrefine_server' in parameters:
            openrefine_server = parameters['openrefine_server']

        text = ''
        for field in data:

            values = data[field]

            if not isinstance(values, list):
                values = [values]

            for value in values:
                if value:
                    text = "{}{}\n".format(text, value)

        if openrefine_server:
            # use REST-API on (remote) HTTP server
            params = {'text': text}
            r = requests.post(openrefine_server, params=params)
            results = r.json()

        else:
            # use local Python library
            linker = Entity_Linker()
            linker.verbose = verbose

            results = linker.entities(text=text,
                                      taggers=entity_linking_taggers)

        if verbose:
            print("Named Entity Linking: {}".format(results))

        for match in results:
            for candidate in results[match]['result']:
                if candidate['match']:
                    for facet in candidate['type']:
                        etl.append(data, facet, candidate['name'])
                        etl.append(data, facet + '_uri_ss', candidate['id'])

        # mark the document as already analyzed by this plugin
        data['enhance_entity_linking_b'] = "true"

        return parameters, data
	def process(self, parameters=None, data=None):
		if parameters is None:
			parameters = {}
		if data is None:
			data = {}
	
		verbose = False
		if 'verbose' in parameters:
			if parameters['verbose']:	
				verbose = True

		entity_linking_taggers = ['all_labels_ss_tag']
		if 'entity_linking_taggers' in parameters:
			entity_linking_taggers = parameters['entity_linking_taggers']

		# add taggers for stemming
		entity_linking_taggers_document_language_dependent = {}
		if 'entity_linking_taggers_document_language_dependent' in parameters:
			entity_linking_taggers_document_language_dependent = parameters['entity_linking_taggers_document_language_dependent']

		if 'language_s' in data:
			# is there a language-specific tagger for the detected language?
			if data['language_s'] in entity_linking_taggers_document_language_dependent:
				for entity_linking_tagger in entity_linking_taggers_document_language_dependent[data['language_s']]:
					if entity_linking_tagger not in entity_linking_taggers:
						entity_linking_taggers.append(entity_linking_tagger)
		
		openrefine_server = False
		if 'openrefine_server' in parameters:
			openrefine_server = parameters['openrefine_server']

		taxonomy_fields = ['skos_broader_taxonomy_prefLabel_ss']

		# collect/copy the text to be analyzed from all fields
		text = ''
		for field in data:
			
			values = data[field]

			if not isinstance(values, list):
				values = [values]
			
			for value in values:
				if value:
					text = "{}{}\n".format(text, value)

		# tag all entities (by different taggers for different analyzers/stemmers)
		for entity_linking_tagger in entity_linking_taggers:

			results = {}

			retries = 0
			retrytime = 1
			retrytime_max = 120 # wait time until the next retry is doubled, up to a maximum of 120 seconds (2 minutes)
			no_connection = True
			
			while no_connection:
				try:
					if retries > 0:
						print('Retrying to connect to Solr tagger in {} second(s).'.format(retrytime))
						time.sleep(retrytime)
						retrytime = retrytime * 2
						if retrytime > retrytime_max:
							retrytime = retrytime_max
		
					# call REST API
					if openrefine_server:
						# use REST-API on (remote) HTTP server
						params = {'text': text}
						r = requests.post(openrefine_server, params=params)
						# if bad status code, raise exception
						r.raise_for_status()

						results = r.json()
						
					else:
						# use local Python library
						linker = Entity_Linker()
						linker.verbose = verbose
			
						results = linker.entities( text = text, taggers = [entity_linking_tagger], additional_result_fields = taxonomy_fields )
	
					no_connection = False
				
				except KeyboardInterrupt:
					raise KeyboardInterrupt
				
				except requests.exceptions.ConnectionError as e:
					
					retries += 1
					
					if openrefine_server:
						sys.stderr.write( "Connection to Openrefine server failed (will retry in {} seconds). Exception: {}\n".format(retrytime, e) )
					else:
						sys.stderr.write( "Connection to Solr text tagger failed (will retry in {} seconds). Exception: {}\n".format(retrytime, e) )
				
				except requests.exceptions.HTTPError as e:
					if e.response.status_code == 503:

						retries += 1
						
						if openrefine_server:
							sys.stderr.write( "OpenRefine server temporarily unavailable (HTTP status code 503), will retry in {} seconds. Exception: {}\n".format(retrytime, e) )
						else:
							sys.stderr.write( "Solr temporarily unavailable (HTTP status code 503), will retry in {} seconds. Exception: {}\n".format(retrytime, e) )

					elif e.response.status_code == 400:
						no_connection = False

						# if the error is caused by an empty entity index for this tagger (no entities imported yet), do not log an error or mark the document as failed
						empty_entity_index = False
						try:
							errorstatus = e.response.json()
							if errorstatus['error']['msg'] == 'field ' + entity_linking_tagger + ' has no indexed data':
								empty_entity_index = True
						except Exception:
							pass
						
						if not empty_entity_index:
							etl.error_message(docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)

					else:
						no_connection = False
						etl.error_message(docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)

				except BaseException as e:
					no_connection = False
					etl.error_message(docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)

			if verbose:
				print ("Named Entity Linking by Tagger {}: {}".format(entity_linking_tagger, results))
	
	
			# write entities from result to document facets
			for match in results:
				for candidate in results[match]['result']:
					if candidate['match']:
						for facet in candidate['type']:

							# use a different facet for fuzzy/stemmed matches
							if entity_linking_tagger != 'all_labels_ss_tag':
								# do not use a separate facet if it is the same stemmer, only forced instead of document-language dependent
								entity_linking_tagger_withoutforceoption = entity_linking_tagger.replace('_stemming_force_', '_stemming_')
								facet = facet + entity_linking_tagger_withoutforceoption + '_ss'
							
							etl.append(data, facet, candidate['name'])
							etl.append(data, facet + '_uri_ss', candidate['id'])
							etl.append(data, facet + '_preflabel_and_uri_ss', candidate['name'] + ' <' + candidate['id'] + '>')

							if 'matchtext' in candidate:
								for matchtext in candidate['matchtext']:
									etl.append(data, facet + '_matchtext_ss', candidate['id'] + "\t" + matchtext)
							
							for taxonomy_field in taxonomy_fields:
								if taxonomy_field in candidate:
									separated_taxonomy_fields = taxonomy2fields(field=facet, data=candidate[taxonomy_field])
									for separated_taxonomy_field in separated_taxonomy_fields:
										etl.append(data, separated_taxonomy_field, separated_taxonomy_fields[separated_taxonomy_field])
	

		# mark the document as already analyzed by this plugin
		data['etl_enhance_entity_linking_b'] = "true"
		
		return parameters, data
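taxonomy2fields is also not shown in these snippets; a loudly hypothetical sketch of what such a helper could do, assuming each entry in skos_broader_taxonomy_prefLabel_ss is a tab-separated path of broader-concept labels (the project's real implementation may differ):

def taxonomy2fields(field=None, data=None):
    # hypothetical sketch: emit one multi-valued field per taxonomy level,
    # e.g. 'person_ss_taxonomy0_ss' for the topmost broader concept
    fields = {}
    for path in data:
        for level, label in enumerate(path.split("\t")):
            fields.setdefault('{}_taxonomy{}_ss'.format(field, level), []).append(label)
    return fields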