Example #1
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        # todo: use all data fields for analysis
        text = ''
        if 'content_txt' in data:
            text = data['content_txt']

        for match in re.finditer(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]', text, re.IGNORECASE):
            value = match.group(0)
            etl.append(data, 'phone_ss', value)


        # if phone number(s) were extracted, normalize them to a format usable for aggregation/filters

        if 'phone_ss' in data:

            phones = data['phone_ss']
            if not isinstance(phones, list):
                phones = [phones]

            for phone in phones:
                phone_normalized = normalize_phonenumber(phone)
                etl.append(data, 'phone_normalized_ss', phone_normalized)

        return parameters, data
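
Note: normalize_phonenumber() is defined elsewhere in the project and not shown here. A minimal sketch of what such a helper could look like, assuming the third-party phonenumbers library and a default region of 'DE' (both are assumptions, not part of the example above):

import phonenumbers

def normalize_phonenumber(phone, default_region='DE'):
    # parse the raw regex match and normalize it to E.164 (e.g. +4930901820);
    # return the raw string unchanged if it cannot be parsed
    try:
        parsed = phonenumbers.parse(phone, default_region)
        if phonenumbers.is_possible_number(parsed):
            return phonenumbers.format_number(
                parsed, phonenumbers.PhoneNumberFormat.E164)
    except phonenumbers.NumberParseException:
        pass
    return phone
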
Example #2
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        if 'enhance_file_meta_filename' in parameters:
            meta_json_file = parameters['enhance_file_meta_filename']
        else:
            if verbose:
                print(
                    'enhance_file_meta_filename not defined in config, please add config[\'enhance_file_meta_filename\'] = \'meta.json\''
                )
            return parameters, data

        id = parameters['id']
        if 'container' in parameters:
            id = parameters['container']
        id = id.replace('file://', '', 1)
        directory = os.path.dirname(id)
        metafile = directory + '/' + meta_json_file
        meta = {}

        if os.path.isfile(metafile):
            if zipfile.is_zipfile(metafile):
                if verbose:
                    print('is zipfile:' + metafile)
                with ZipFile(metafile) as myzip:
                    fn = myzip.namelist()
                    with myzip.open(fn[0]) as myfile:
                        tmp = myfile.read()
            else:
                if metafile.endswith('.xz'):
                    if verbose:
                        print('is XZ file:' + metafile)
                    with lzma.open(metafile, mode='rt',
                                   encoding='utf-8') as myfile:
                        tmp = myfile.read()
                else:
                    if verbose:
                        print('not zip nor xz:' + metafile)
                    with open(metafile) as myfile:
                        tmp = myfile.read()
            try:
                meta = json.loads(tmp)
            except Exception as e:
                print('Exception loading {}: {}'.format(metafile, e))
            else:
                #data.update(meta)
                for k in meta:
                    etl.append(data, k, meta[k])
                if verbose:
                    print('meta file:' + metafile)
                    print('meta:' + json.dumps(meta))
        else:
            if verbose:
                print('file does not exist ' + metafile)

        return parameters, data
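
A hypothetical invocation of the plugin above; the class name enhance_file_meta, the path and the config values are assumptions for illustration:

# hypothetical config: read a sidecar file meta.json next to each indexed document
parameters = {
    'id': 'file:///var/data/docs/report.pdf',
    'enhance_file_meta_filename': 'meta.json',
    'verbose': True,
}
data = {}

# the plugin would look for /var/data/docs/meta.json (plain, zip or xz compressed)
# and append every key/value of the JSON object to the document's data fields
parameters, data = enhance_file_meta().process(parameters, data)
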
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        openrefine_server = False

        text = ''
        for field in data:

            values = data[field]

            if not isinstance(values, list):
                values = [values]

            for value in values:
                if value:
                    text = "{}{}\n".format(text, value)

        if openrefine_server:
            # use REST-API on (remote) HTTP server
            params = {'text': text}
            r = requests.post(openrefine_server, params=params)
            results = r.json()

        else:
            # use local Python library
            linker = Entity_Linker()
            linker.verbose = verbose

            results = linker.entities(text=text)

        if verbose:
            print("Named Entity Linking: {}".format(results))

        for match in results:
            for candidate in results[match]['result']:
                if candidate['match']:
                    for facet in candidate['type']:
                        etl.append(data, facet, candidate['name'])
                        etl.append(data, facet + '_uri_ss', candidate['id'])

        # mark the document as already analyzed by this plugin
        data['enhance_entity_linking_b'] = "true"

        return parameters, data
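
For orientation, the nested loops above expect results to map each matched text span to candidate entities. A hypothetical (made up) example of that structure:

# hypothetical example of the result structure the loops above iterate over
results = {
    'Barack Obama': {
        'result': [
            {
                'match': True,
                'name': 'Barack Obama',
                'id': 'http://www.wikidata.org/entity/Q76',
                'type': ['person_ss'],
            },
        ]
    }
}
# for every candidate with 'match' == True the plugin appends candidate['name']
# to each facet listed in candidate['type'] and the URI to '<facet>_uri_ss'
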
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        openrefine_server = False

        analyse_fields = [
            'title_txt', 'content_txt', 'description_txt', 'ocr_t',
            'ocr_descew_t'
        ]

        text = ''
        for field in analyse_fields:
            if field in data:
                text = "{}{}\n".format(text, data[field])

        if openrefine_server:
            # use REST-API on (remote) HTTP server
            params = {'text': text}
            r = requests.post(openrefine_server, params=params)
            results = r.json()

        else:
            # use local Python library
            linker = Entity_Linker()
            linker.verbose = verbose

            results = linker.entities(text=text)

        if verbose:
            print("Named Entity Linking: {}".format(results))

        for match in results:
            for candidate in results[match]['result']:

                for facet in candidate['type']:
                    etl.append(data, facet, candidate['name'])
                    etl.append(data, facet + '_uri_ss', candidate['id'])

        # mark the document as already analyzed by this plugin
        data['enhance_entity_linking_b'] = "true"

        return parameters, data
def regex2facet(data, text, regex, group, facet, verbose=False):

    if verbose:
        print("Checking regex {} for facet {}".format(regex, facet))

    matches = re.finditer(regex, text, re.IGNORECASE)

    if matches:
        for match in matches:

            value = None
            try:
                value = match.group(group)
                if verbose:
                    print("Found regex {} with value {} for facet {}".format(
                        regex, value, facet))

                etl.append(data, facet, value)

            except BaseException as e:
                print(
                    "Exception while adding value {} from regex {} and group {} to facet {}:"
                    .format(value, regex, group, facet))
                print(e)
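
A short usage example of the helper above; the regex and the facet name iban_ss are made up for illustration:

# hypothetical call: extract IBAN-like strings from the document text into the facet 'iban_ss'
text = data.get('content_txt', '')
regex2facet(data=data,
            text=text,
            regex=r'\b[A-Z]{2}[0-9]{2}(?: ?[0-9A-Z]{4}){2,8}\b',
            group=0,
            facet='iban_ss',
            verbose=True)
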
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        # todo: use all data fields for analysis
        text = ''
        if 'content_txt' in data:
            text = data['content_txt']

        for match in re.finditer(r'[\w\.-]+@[\w\.-]+', text, re.IGNORECASE):
            value = match.group(0)
            etl.append(data, 'email_ss', value)


        # if email addresses were extracted, do further analysis to fill separate specialized facets
        if 'email_ss' in data:

            # extract email addresses of the sender (From)
            for match in re.finditer(r'From: (.* )?([\w\.-]+@[\w\.-]+)', text, re.IGNORECASE):
                value = match.group(2)
                etl.append(data, 'Message-From_ss', value)

            # extract email addresses of recipients (To)
            for match in re.finditer(r'To: (.* )?([\w\.-]+@[\w\.-]+)', text, re.IGNORECASE):
                value = match.group(2)
                etl.append(data, 'Message-To_ss', value)

            # extract the domain part of all email addresses into the email domain facet
            data['email_domain_ss'] = []
            emails = data['email_ss']
            if not isinstance(emails, list):
                emails = [emails]

            for email in emails:
                domain = email.split('@')[1]
                etl.append(data, 'email_domain_ss', domain)

        return parameters, data
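
A hypothetical before/after illustration of the fields this plugin fills; the plugin class name enhance_extract_email is an assumption:

# hypothetical input document
data = {'content_txt': 'From: alice@example.org\nTo: bob@example.com\nHi Bob, ...'}
parameters, data = enhance_extract_email().process({}, data)
# data now contains (approximately):
#   data['email_ss']         == ['alice@example.org', 'bob@example.com']
#   data['Message-From_ss']  == 'alice@example.org'
#   data['Message-To_ss']    == 'bob@example.com'
#   data['email_domain_ss']  == ['example.org', 'example.com']
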
Example #7
	def etl_graph(self, parameters):

		if self.verbose:
			print("Graph has {} triples.".format(len(self.graph)) )
	
		count_triple = 0
		count_subjects = 0
	
		part_parameters = {}
		part_parameters['plugins'] = []
		part_parameters['export'] = parameters['export']
						
		property2facet = {}
		if 'property2facet' in parameters:
			property2facet = parameters['property2facet']

		etl_processor = ETL()
		etl_processor.verbose = self.verbose
		
		class_properties = []
		class_properties.append(rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'))
		class_properties.append(rdflib.term.URIRef(u'http://www.wikidata.org/prop/direct/P31'))
		# since there can be multiple triples/values for same property,
		# do not overwrite document but add value to existent document & values of the facet/field/property
		part_parameters['add'] = True
		# but not for the field content_type which doesn't change and is not multi valued
		part_parameters['fields_set'] = "content_type"

		# use SPARQL query with distinct to get subjects only once
		res = self.graph.query(
			"""SELECT DISTINCT ?subject
			WHERE {
			?subject ?predicate ?object .
			}""")
	
		for row in res:

			count_subjects += 1
	
			if self.verbose:
				print( "Importing entity / subject {}".format(count_subjects) )

			# get subject of the concept from first column
			subj = row[0]

			if self.verbose:
				print ( "Processing RDF subject {}".format(subj) )

			part_data = {}
			part_data['content_type'] = 'Knowledge graph'
			part_data['content_type_group'] = 'Knowledge graph'
			# subject as URI/ID
			part_parameters['id'] = subj
			
			preferred_label = self.get_preferred_label(subject=subj)
			part_data['title'] = preferred_label
			
			count_subject_triple = 0

			# get all triples for this subject
			for pred, obj in self.graph.predicate_objects(subject=subj):

				count_triple += 1
				count_subject_triple += 1

				if self.verbose:
					print( "Importing subjects triple {}".format(count_subject_triple) )
					print( "Predicate / property: {}".format(pred) )
					print( "Object / value: {}".format(obj) )


				try:
					
					# if the property denotes a class (RDF rdf:type or Wikidata "instance of" (Property:P31)), add the preferred label of this entity to the facet of its class,
					# so its name (label) will be available in the entities view and as a filter for faceted search
					
					if pred in class_properties:
						class_facet = str(obj)
						# map class to facet, if mapping for class exist
						if class_facet in property2facet:
							class_facet = property2facet[class_facet]
						etl.append(data=part_data, facet=class_facet, values=preferred_label)			

					#
					# Predicate/property to facet/field
					#

					# set Solr datatype strings so facets not available yet in Solr schema can be inserted automatically (dynamic fields) with right datatype
					
					facet = pred + '_ss'
					facet_uri = facet + '_uri_ss'
					facet_preferred_label_and_uri = facet + '_preflabel_and_uri_ss'
					
					if self.verbose:
						print ( "Facet: {}".format(facet) )

	
					#
					# get values or labels of this object
					#

					values = self.get_values(obj=obj)
					if self.verbose:
						print ( "Values: {}".format(values) )

					# insert or append value (object of triple) to data
					etl.append(data=part_data, facet=facet, values=values)
					

					# if object is reference/URI append URI
					if type(obj) == rdflib.URIRef:
						
						uri = obj
						
						etl.append( data=part_data, facet=facet_uri, values=uri )

						# append mixed field with preferred label and URI of the object for disambiguation of different Entities/IDs/URIs with same names/labels in faceted search
						preferredlabel_and_uri = "{} <{}>".format ( self.get_preferred_label(subject=obj), obj)

					else:
						preferredlabel_and_uri = self.get_preferred_label(subject=obj)
					
					etl.append(data=part_data, facet=facet_preferred_label_and_uri, values=preferredlabel_and_uri)


				except KeyboardInterrupt:
					raise KeyboardInterrupt
	
				except BaseException as e:
					sys.stderr.write( "Exception while processing triple {} of subject {}: {}\n".format(count_subject_triple, subj, e) )
	
	
			# index the document built for this subject
			etl_processor.process( part_parameters, part_data)
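
get_preferred_label() and get_values() are methods defined elsewhere in this class and not shown here. A rough sketch of the label lookup, assuming rdflib and a preference for skos:prefLabel over rdfs:label (an assumption, not the project's exact implementation):

from rdflib.namespace import RDFS, SKOS

def get_preferred_label(self, subject):
    # return the first skos:prefLabel, else the first rdfs:label,
    # else fall back to the plain subject URI
    for label_property in (SKOS.prefLabel, RDFS.label):
        for label in self.graph.objects(subject=subject, predicate=label_property):
            return str(label)
    return str(subject)
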
Example #8
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        entity_linking_taggers = ['all_labels_ss_tag']
        if 'entity_linking_taggers' in parameters:
            entity_linking_taggers = parameters['entity_linking_taggers']

        # add taggers for stemming
        entity_linking_taggers_document_language_dependent = {}
        if 'entity_linking_taggers_document_language_dependent' in parameters:
            entity_linking_taggers_document_language_dependent = parameters[
                'entity_linking_taggers_document_language_dependent']

        if 'language_s' in data:
            # is a language specific tagger there for the detected language?
            if data['language_s'] in entity_linking_taggers_document_language_dependent:
                for entity_linking_tagger in entity_linking_taggers_document_language_dependent[
                        data['language_s']]:
                    if entity_linking_tagger not in entity_linking_taggers:
                        entity_linking_taggers.append(entity_linking_tagger)

        openrefine_server = False
        if 'openrefine_server' in parameters:
            openrefine_server = parameters['openrefine_server']

        taxonomy_fields = ['skos_broader_taxonomy_prefLabel_ss']

        # collect the text to be analyzed from all fields
        text = ''
        for field in data:

            values = data[field]

            if not isinstance(values, list):
                values = [values]

            for value in values:
                if value:
                    text = "{}{}\n".format(text, value)

        # tag all entities (by different taggers for different analyzers/stemmers)
        for entity_linking_tagger in entity_linking_taggers:

            # call REST API
            if openrefine_server:
                # use REST-API on (remote) HTTP server
                params = {'text': text}
                r = requests.post(openrefine_server, params=params)
                results = r.json()

            else:
                # use local Python library
                linker = Entity_Linker()
                linker.verbose = verbose

                results = linker.entities(
                    text=text,
                    taggers=[entity_linking_tagger],
                    additional_result_fields=taxonomy_fields)

            if verbose:
                print("Named Entity Linking by Tagger {}: {}".format(
                    entity_linking_tagger, results))

            # write entities from result to document facets
            for match in results:
                for candidate in results[match]['result']:
                    if candidate['match']:
                        for facet in candidate['type']:

                            # use different facet for fuzzy/stemmed matches
                            if not entity_linking_tagger == 'all_labels_ss_tag':
                                # do not use another different facet if same stemmer but forced / not document language dependent
                                entity_linking_tagger_withoutforceoption = entity_linking_tagger.replace(
                                    '_stemming_force_', '_stemming_')
                                facet = facet + entity_linking_tagger_withoutforceoption + '_ss'

                            etl.append(data, facet, candidate['name'])
                            etl.append(data, facet + '_uri_ss',
                                       candidate['id'])
                            etl.append(
                                data, facet + '_preflabel_and_uri_ss',
                                candidate['name'] + ' <' + candidate['id'] +
                                '>')

                            if 'matchtext' in candidate:
                                for matchtext in candidate['matchtext']:
                                    etl.append(
                                        data, facet + '_matchtext_ss',
                                        candidate['id'] + "\t" + matchtext)

                            for taxonomy_field in taxonomy_fields:
                                if taxonomy_field in candidate:
                                    separated_taxonomy_fields = taxonomy2fields(
                                        field=facet,
                                        data=candidate[taxonomy_field])
                                    for separated_taxonomy_field in separated_taxonomy_fields:
                                        etl.append(
                                            data, separated_taxonomy_field,
                                            separated_taxonomy_fields[
                                                separated_taxonomy_field])

        # mark the document as already analyzed by this plugin
        data['etl_enhance_entity_linking_b'] = "true"

        return parameters, data
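
A hypothetical configuration snippet for the parameters read above; the stemming tagger handler names and the server URL are made up for illustration:

config = {}
# default tagger(s) used for every document
config['entity_linking_taggers'] = ['all_labels_ss_tag']
# additional stemming taggers, only added for documents of the given language
config['entity_linking_taggers_document_language_dependent'] = {
    'en': ['all_labels_stemming_en_ss_tag'],
    'de': ['all_labels_stemming_de_ss_tag'],
}
# optional: delegate tagging to a (remote) HTTP endpoint instead of the local Python library
# config['openrefine_server'] = 'http://localhost:8080/reconcile'
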
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        if 'spacy_ner_mapping' in parameters:
            mapping = parameters['spacy_ner_mapping']
        else:
            mapping = {
                'ORG': 'organization_ss',
                'NORP': 'organization_ss',
                'PER': 'person_ss',
                'PERSON': 'person_ss',
                'GPE': 'location_ss',
                'LOC': 'location_ss',
                'FACILITY': 'location_ss',
                'PRODUCT': 'product_ss',
                'EVENT': 'event_ss',
                'LAW': 'law_ss',
                'DATE': 'date_ss',
                'TIME': 'time_ss',
                'MONEY': 'money_ss',
                'WORK_OF_ART': 'work_of_art_ss',
            }

        # default classifier
        classifier = 'en_core_web_sm'

        if 'spacy_ner_classifier_default' in parameters:
            classifier = parameters['spacy_ner_classifier_default']

        # set language specific classifier, if configured and document language detected
        if 'spacy_ner_classifiers' in parameters and 'language_s' in data:
            # is there a language specific classifier for the detected language?
            if data['language_s'] in parameters['spacy_ner_classifiers']:
                classifier = parameters['spacy_ner_classifiers'][
                    data['language_s']]

        # if standard classifier configured to None and no classifier for detected language, exit the plugin
        if not classifier:

            return parameters, data

        if verbose:
            print(
                "Using SpaCY NER language / classifier: {}".format(classifier))

        analyse_fields = [
            'title_txt', 'content_txt', 'description_txt', 'ocr_t',
            'ocr_descew_t'
        ]

        text = ''
        for field in analyse_fields:
            if field in data:
                text = "{}{}\n".format(text, data[field])

        # classify/tag each word of the content with its entity class

        url = "http://localhost:8080/ent"
        if os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER'):
            url = os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER') + '/ent'

        headers = {'content-type': 'application/json'}
        d = {'text': text, 'model': classifier}

        retries = 0
        retrytime = 1
        # wait time until the next retry is doubled, up to a maximum of 120 seconds (2 minutes)
        retrytime_max = 120
        no_connection = True

        while no_connection:
            try:
                if retries > 0:
                    print(
                        'Retrying to connect to Spacy services in {} second(s).'
                        .format(retrytime))
                    time.sleep(retrytime)
                    retrytime = retrytime * 2
                    if retrytime > retrytime_max:
                        retrytime = retrytime_max

                response = requests.post(url,
                                         data=json.dumps(d),
                                         headers=headers)
                # if bad status code, raise exception
                response.raise_for_status()

                no_connection = False

            except requests.exceptions.ConnectionError as e:
                retries += 1
                sys.stderr.write(
                    "Connection to spaCy services failed (will retry in {} seconds). Exception: {}\n"
                    .format(retrytime, e))

        r = response.json()

        for ent in r:

            entity_class = ent['label']
            # get entity string from returned start and end value
            entity = text[int(ent['start']):int(ent['end'])]

            # strip whitespaces from begin and end
            entity = entity.strip()

            # after strip exclude empty entities
            if not entity:
                continue

            # if class of entity is mapped to a facet/field, append the entity to this facet/field

            if entity_class in mapping:

                if verbose:
                    print(
                        "NER classified word(s)/name {} to {}. Appending to mapped facet {}"
                        .format(entity, entity_class, mapping[entity_class]))

                etl.append(data, mapping[entity_class], entity)

            else:
                if verbose:
                    print(
                        "Since Named Entity Recognition (NER) class {} not mapped to a field/facet, ignore entity/word(s): {}"
                        .format(entity_class, entity))

        return parameters, data
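
For reference, the loop above expects the /ent endpoint to return a JSON list of entity spans with character offsets into the submitted text. A hypothetical response for text = 'Berlin is in Germany':

# hypothetical JSON response of the spaCy '/ent' service
r = [
    {'start': 0,  'end': 6,  'label': 'GPE'},   # text[0:6]   == 'Berlin'
    {'start': 13, 'end': 20, 'label': 'GPE'},   # text[13:20] == 'Germany'
]
# each span is sliced out of the original text and, via the mapping above
# (GPE -> 'location_ss'), appended to the corresponding facet
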
	def process(self, parameters=None, data=None):
		if parameters is None:
			parameters = {}
		if data is None:
			data = {}
	
		verbose = False
		if 'verbose' in parameters:
			if parameters['verbose']:	
				verbose = True


		if 'stanford_ner_mapping' in parameters:
			mapping = parameters['stanford_ner_mapping']
		else:
			# todo: extend mapping for models with more classes like dates
			mapping = {
			 'PERSON': 'person_ss',
			 'LOCATION': 'location_ss',
			 'ORGANIZATION': 'organization_ss',
			 'I-ORG': 'organization_ss',
			 'I-PER': 'person_ss',
			 'I-LOC': 'location_ss',
			 'ORG': 'organization_ss',
			 'PER': 'person_ss',
			 'LOC': 'location_ss',
			 'PERS': 'person_ss',
			 'LUG': 'location_ss',
			 'MONEY': 'money_ss',
			}
	

		# default classifier
		classifier = 'english.all.3class.distsim.crf.ser.gz'

		if 'stanford_ner_classifier_default' in parameters:
			classifier = parameters['stanford_ner_classifier_default']

		# set language specific classifier, if configured and document language detected
		if 'stanford_ner_classifiers' in parameters and 'language_s' in data:
			# is there a language specific classifier for the detected language?
			if data['language_s'] in parameters['stanford_ner_classifiers']:
				classifier = parameters['stanford_ner_classifiers'][data['language_s']]

		# if standard classifier configured to None and no classifier for detected language, exit the plugin
		if not classifier:
			return parameters, data

		kwargs={}

		if 'stanford_ner_java_options' in parameters:
			kwargs['java_options'] = parameters['stanford_ner_java_options']

		if 'stanford_ner_path_to_jar' in parameters:
			kwargs['path_to_jar'] = parameters['stanford_ner_path_to_jar']

		analyse_fields = ['title_txt','content_txt','description_txt','ocr_t','ocr_descew_t']

		text = ''
		for field in analyse_fields:
			if field in data:
				text = "{}{}\n".format(text, data[field])

		# classify/tag each word of the content with its entity class
		st = StanfordNERTagger(classifier, encoding='utf8', verbose=verbose, **kwargs)
		entities = st.tag(text.split())

		# combine consecutive words of the same class into multi word entities (i.e. split on class changes instead of on single words/tokens)
		entities = self.multi_word_entities(entities)

		# if class of entity is mapped to a facet/field, append the entity to this facet/field
		for entity, entity_class in entities:

			if entity_class in mapping:
				
				if verbose:
					print ( "NER classified word(s)/name {} to {}. Appending to mapped facet {}".format(entity, entity_class, mapping[entity_class]) )

				etl.append(data, mapping[entity_class], entity)

			else:
				if verbose:
					print ( "Since Named Entity Recognition (NER) class {} not mapped to a field/facet, ignore entity/word(s): {}".format(entity_class, entity) )


		# mark the document as already analyzed by this plugin
		data['enhance_ner_stanford_b'] = "true"
		
		return parameters, data
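
multi_word_entities() is defined elsewhere in this class and not shown here. A minimal sketch of the merging it presumably performs (joining consecutive tokens that received the same NER class), not the project's exact implementation:

from itertools import groupby

def multi_word_entities(self, entities):
    # entities: list of (token, class) pairs as returned by StanfordNERTagger.tag();
    # join consecutive tokens of the same class into one multi word entity
    merged = []
    for entity_class, group in groupby(entities, key=lambda pair: pair[1]):
        tokens = [token for token, _ in group]
        merged.append((' '.join(tokens), entity_class))
    return merged
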
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        if 'spacy_ner_mapping' in parameters:
            mapping = parameters['spacy_ner_mapping']
        else:
            mapping = {
                'ORG': 'organization_ss',
                'NORP': 'organization_ss',
                'PER': 'person_ss',
                'PERSON': 'person_ss',
                'GPE': 'location_ss',
                'LOC': 'location_ss',
                'FACILITY': 'location_ss',
                'PRODUCT': 'product_ss',
                'EVENT': 'event_ss',
                'LAW': 'law_ss',
                'DATE': 'date_ss',
                'TIME': 'time_ss',
                'MONEY': 'money_ss',
                'WORK_OF_ART': 'work_of_art_ss',
            }

        # default classifier
        classifier = 'en'

        if 'spacy_ner_classifier_default' in parameters:
            classifier = parameters['spacy_ner_classifier_default']

        # set language specific classifier, if configured and document language detected
        if 'spacy_ner_classifiers' in parameters and 'language_s' in data:
            # is there a language specific classifier for the detected language?
            if data['language_s'] in parameters['spacy_ner_classifiers']:
                classifier = parameters['spacy_ner_classifiers'][
                    data['language_s']]

        # if standard classifier configured to None and no classifier for detected language, exit the plugin
        if not classifier:

            return parameters, data

        if verbose:
            print(
                "Using SpaCY NER language / classifier: {}".format(classifier))

        analyse_fields = [
            'title', 'content', 'description', 'ocr_t', 'ocr_descew_t'
        ]

        text = ''
        for field in analyse_fields:
            if field in data:
                text = "{}{}\n".format(text, data[field])

        # classify/tag each word of the content with its entity class

        url = os.getenv('OSS_SPACY_URL', default='http://localhost:8000/ent')
        headers = {'content-type': 'application/json'}
        d = {'text': text, 'model': classifier}

        response = requests.post(url, data=json.dumps(d), headers=headers)
        r = response.json()

        for ent in r:

            entity_class = ent['type']
            # get entity string from returned start and end value
            entity = text[int(ent['start']):int(ent['end'])]

            # strip whitespaces from begin and end
            entity = entity.strip()

            # after strip exclude empty entities
            if not entity:
                continue

            # if class of entity is mapped to a facet/field, append the entity to this facet/field

            if entity_class in mapping:

                if verbose:
                    print(
                        "NER classified word(s)/name {} to {}. Appending to mapped facet {}"
                        .format(entity, entity_class, mapping[entity_class]))

                etl.append(data, mapping[entity_class], entity)

            else:
                if verbose:
                    print(
                        "Since Named Entity Recognition (NER) class {} not mapped to a field/facet, ignore entity/word(s): {}"
                        .format(entity_class, entity))

        # mark the document as already analyzed by this plugin
        data['enhance_ner_spacy_b'] = "true"

        return parameters, data
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        if 'stanford_ner_mapping' in parameters:
            mapping = parameters['stanford_ner_mapping']
        else:
            # todo: extend mapping for models with more classes like dates
            mapping = {
                'PERSON': 'person_ss',
                'LOCATION': 'location_ss',
                'ORGANIZATION': 'organization_ss',
                'I-ORG': 'organization_ss',
                'I-PER': 'person_ss',
                'I-LOC': 'location_ss',
                'ORG': 'organization_ss',
                'PER': 'person_ss',
                'LOC': 'location_ss',
                'PERS': 'person_ss',
                'LUG': 'location_ss',
                'MONEY': 'money_ss',
            }

        # default classifier
        classifier = 'english.all.3class.distsim.crf.ser.gz'

        if 'stanford_ner_classifier_default' in parameters:
            classifier = parameters['stanford_ner_classifier_default']

        # set language specific classifier, if configured and document language detected
        if 'stanford_ner_classifiers' in parameters and 'language_s' in data:
            # is there a language specific classifier for the detected language?
            if data['language_s'] in parameters['stanford_ner_classifiers']:
                classifier = parameters['stanford_ner_classifiers'][
                    data['language_s']]

        # if standard classifier configured to None and no classifier for detected language, exit the plugin
        if not classifier:
            return parameters, data

        kwargs = {}

        if 'stanford_ner_java_options' in parameters:
            kwargs['java_options'] = parameters['stanford_ner_java_options']

        if 'stanford_ner_path_to_jar' in parameters:
            kwargs['path_to_jar'] = parameters['stanford_ner_path_to_jar']

        analyse_fields = [
            'title_txt', 'content_txt', 'description_txt', 'ocr_t',
            'ocr_descew_t'
        ]

        text = ''
        for field in analyse_fields:
            if field in data:
                text = "{}{}\n".format(text, data[field])

        # classify/tag each word of the content with its entity class
        st = StanfordNERTagger(classifier,
                               encoding='utf8',
                               verbose=verbose,
                               **kwargs)
        entities = st.tag(text.split())

        # combine consecutive words of the same class into multi word entities (i.e. split on class changes instead of on single words/tokens)
        entities = self.multi_word_entities(entities)

        # if class of entity is mapped to a facet/field, append the entity to this facet/field
        for entity, entity_class in entities:

            if entity_class in mapping:

                if verbose:
                    print(
                        "NER classified word(s)/name {} to {}. Appending to mapped facet {}"
                        .format(entity, entity_class, mapping[entity_class]))

                etl.append(data, mapping[entity_class], entity)

            else:
                if verbose:
                    print(
                        "Since Named Entity Recognition (NER) class {} not mapped to a field/facet, ignore entity/word(s): {}"
                        .format(entity_class, entity))

        # mark the document as already analyzed by this plugin
        data['enhance_ner_stanford_b'] = "true"

        return parameters, data
Example #13
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        entity_linking_taggers = ['all_labels_ss_tag']
        if 'entity_linking_taggers' in parameters:
            entity_linking_taggers = parameters['entity_linking_taggers']

        # add taggers for stemming
        entity_linking_taggers_document_language_dependent = {}
        if 'entity_linking_taggers_document_language_dependent' in parameters:
            entity_linking_taggers_document_language_dependent = parameters[
                'entity_linking_taggers_document_language_dependent']

        if 'language_s' in data:
            # is a language specific tagger there for the detected language?
            if data['language_s'] in entity_linking_taggers_document_language_dependent:
                for entity_linking_tagger in entity_linking_taggers_document_language_dependent[
                        data['language_s']]:
                    if entity_linking_tagger not in entity_linking_taggers:
                        entity_linking_taggers.append(entity_linking_tagger)

        openrefine_server = False
        if 'openrefine_server' in parameters:
            openrefine_server = parameters['openrefine_server']

        taxonomy_fields = ['skos_broader_taxonomy_prefLabel_ss']

        # collect the text to be analyzed from all fields
        text = ''
        for field in data:

            values = data[field]

            if not isinstance(values, list):
                values = [values]

            for value in values:
                if value:
                    text = "{}{}\n".format(text, value)

        # tag all entities (by different taggers for different analyzers/stemmers)
        for entity_linking_tagger in entity_linking_taggers:

            results = {}

            retries = 0
            retrytime = 1
            retrytime_max = 120  # wait time until the next retry is doubled, up to a maximum of 120 seconds (2 minutes)
            no_connection = True

            while no_connection:
                try:
                    if retries > 0:
                        print(
                            'Retrying to connect to Solr tagger in {} second(s).'
                            .format(retrytime))
                        time.sleep(retrytime)
                        retrytime = retrytime * 2
                        if retrytime > retrytime_max:
                            retrytime = retrytime_max

                    # call REST API
                    if openrefine_server:
                        # use REST-API on (remote) HTTP server
                        params = {'text': text}
                        r = requests.post(openrefine_server, params=params)
                        # if bad status code, raise exception
                        r.raise_for_status()

                        results = r.json()

                    else:
                        # use local Python library
                        linker = Entity_Linker()
                        linker.verbose = verbose

                        results = linker.entities(
                            text=text,
                            taggers=[entity_linking_tagger],
                            additional_result_fields=taxonomy_fields)

                    no_connection = False

                except KeyboardInterrupt:
                    raise KeyboardInterrupt

                except requests.exceptions.ConnectionError as e:

                    retries += 1

                    if openrefine_server:
                        sys.stderr.write(
                            "Connection to Openrefine server failed (will retry in {} seconds). Exception: {}\n"
                            .format(retrytime, e))
                    else:
                        sys.stderr.write(
                            "Connection to Solr text tagger failed (will retry in {} seconds). Exception: {}\n"
                            .format(retrytime, e))

                except requests.exceptions.HTTPError as e:
                    if e.response.status_code == 503:

                        retries += 1

                        if openrefine_server:
                            sys.stderr.write(
                                "Openrefine server temporarily unavailable (HTTP status code 503). Will retry in {} seconds. Exception: {}\n"
                                .format(retrytime, e))
                        else:
                            sys.stderr.write(
                                "Solr temporarily unavailable (HTTP status code 503). Will retry in {} seconds. Exception: {}\n"
                                .format(retrytime, e))

                    elif e.response.status_code == 400:
                        no_connection = False

                        # if the error is caused by an empty entity index for this tagger (no entities imported yet), do not log an error message / mark the document as failed
                        empty_entity_index = False
                        try:
                            errorstatus = e.response.json()
                            if errorstatus['error'][
                                    'msg'] == 'field ' + entity_linking_tagger + ' has no indexed data':
                                empty_entity_index = True
                        except Exception:
                            pass

                        if not empty_entity_index:
                            etl.error_message(docid=parameters['id'],
                                              data=data,
                                              plugin='enhance_entity_linking',
                                              e=e)

                    else:
                        no_connection = False
                        etl.error_message(docid=parameters['id'],
                                          data=data,
                                          plugin='enhance_entity_linking',
                                          e=e)

                except BaseException as e:
                    no_connection = False
                    etl.error_message(docid=parameters['id'],
                                      data=data,
                                      plugin='enhance_entity_linking',
                                      e=e)

            if verbose:
                print("Named Entity Linking by Tagger {}: {}".format(
                    entity_linking_tagger, results))

            # write entities from result to document facets
            for match in results:
                for candidate in results[match]['result']:
                    if candidate['match']:
                        for facet in candidate['type']:

                            # use different facet for fuzzy/stemmed matches
                            if not entity_linking_tagger == 'all_labels_ss_tag':
                                # do not use another different facet if same stemmer but forced / not document language dependent
                                entity_linking_tagger_withoutforceoption = entity_linking_tagger.replace(
                                    '_stemming_force_', '_stemming_')
                                facet = facet + entity_linking_tagger_withoutforceoption + '_ss'

                            etl.append(data, facet, candidate['name'])
                            etl.append(data, facet + '_uri_ss',
                                       candidate['id'])
                            etl.append(
                                data, facet + '_preflabel_and_uri_ss',
                                candidate['name'] + ' <' + candidate['id'] +
                                '>')

                            if 'matchtext' in candidate:
                                for matchtext in candidate['matchtext']:
                                    etl.append(
                                        data, facet + '_matchtext_ss',
                                        candidate['id'] + "\t" + matchtext)

                            for taxonomy_field in taxonomy_fields:
                                if taxonomy_field in candidate:
                                    separated_taxonomy_fields = taxonomy2fields(
                                        field=facet,
                                        data=candidate[taxonomy_field])
                                    for separated_taxonomy_field in separated_taxonomy_fields:
                                        etl.append(
                                            data, separated_taxonomy_field,
                                            separated_taxonomy_fields[
                                                separated_taxonomy_field])

        # mark the document as already analyzed by this plugin
        data['etl_enhance_entity_linking_b'] = "true"

        return parameters, data
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        entity_linking_taggers = ['all_labels_ss_tag']
        if 'entity_linking_taggers' in parameters:
            entity_linking_taggers = parameters['entity_linking_taggers']

        entity_linking_taggers_document_language_dependent = {}
        if 'entity_linking_taggers_document_language_dependent' in parameters:
            entity_linking_taggers_document_language_dependent = parameters[
                'entity_linking_taggers_document_language_dependent']

        if 'language_s' in data:
            # is a language specific tagger there for the detected language?
            if data['language_s'] in entity_linking_taggers_document_language_dependent:
                for entity_linking_tagger in entity_linking_taggers_document_language_dependent[
                        data['language_s']]:
                    if entity_linking_tagger not in entity_linking_taggers:
                        entity_linking_taggers.append(entity_linking_tagger)

        openrefine_server = False
        if 'openrefine_server' in parameters:
            openrefine_server = parameters['openrefine_server']

        text = ''
        for field in data:

            values = data[field]

            if not isinstance(values, list):
                values = [values]

            for value in values:
                if value:
                    text = "{}{}\n".format(text, value)

        if openrefine_server:
            # use REST-API on (remote) HTTP server
            params = {'text': text}
            r = requests.post(openrefine_server, params=params)
            results = r.json()

        else:
            # use local Python library
            linker = Entity_Linker()
            linker.verbose = verbose

            results = linker.entities(text=text,
                                      taggers=entity_linking_taggers)

        if verbose:
            print("Named Entity Linking: {}".format(results))

        for match in results:
            for candidate in results[match]['result']:
                if candidate['match']:
                    for facet in candidate['type']:
                        etl.append(data, facet, candidate['name'])
                        etl.append(data, facet + '_uri_ss', candidate['id'])

        # mark the document as already analyzed by this plugin
        data['enhance_entity_linking_b'] = "true"

        return parameters, data
	def process(self, parameters=None, data=None):
		if parameters is None:
			parameters = {}
		if data is None:
			data = {}
	
		verbose = False
		if 'verbose' in parameters:
			if parameters['verbose']:	
				verbose = True

		entity_linking_taggers = ['all_labels_ss_tag']
		if 'entity_linking_taggers' in parameters:
			entity_linking_taggers = parameters['entity_linking_taggers']

		# add taggers for stemming
		entity_linking_taggers_document_language_dependent = {}
		if 'entity_linking_taggers_document_language_dependent' in parameters:
			entity_linking_taggers_document_language_dependent = parameters['entity_linking_taggers_document_language_dependent']

		if 'language_s' in data:
			# is a language specific tagger there for the detected language?
			if data['language_s'] in entity_linking_taggers_document_language_dependent:
				for entity_linking_tagger in entity_linking_taggers_document_language_dependent[data['language_s']]:
					if entity_linking_tagger not in entity_linking_taggers:
						entity_linking_taggers.append(entity_linking_tagger)
		
		openrefine_server = False
		if 'openrefine_server' in parameters:
			openrefine_server = parameters['openrefine_server']

		taxonomy_fields = ['skos_broader_taxonomy_prefLabel_ss']

		# collect the text to be analyzed from all fields
		text = ''
		for field in data:
			
			values = data[field]

			if not isinstance(values, list):
				values = [values]
			
			for value in values:
				if value:
					text = "{}{}\n".format(text, value)

		# tag all entities (by different taggers for different analyzers/stemmers)
		for entity_linking_tagger in entity_linking_taggers:

			results = {}

			retries = 0
			retrytime = 1
			retrytime_max = 120 # wait time until the next retry is doubled, up to a maximum of 120 seconds (2 minutes)
			no_connection = True
			
			while no_connection:
				try:
					if retries > 0:
						print('Retrying to connect to Solr tagger in {} second(s).'.format(retrytime))
						time.sleep(retrytime)
						retrytime = retrytime * 2
						if retrytime > retrytime_max:
							retrytime = retrytime_max
		
					# call REST API
					if openrefine_server:
						# use REST-API on (remote) HTTP server
						params = {'text': text}
						r = requests.post(openrefine_server, params=params)
						# if bad status code, raise exception
						r.raise_for_status()

						results = r.json()
						
					else:
						# use local Python library
						linker = Entity_Linker()
						linker.verbose = verbose
			
						results = linker.entities( text = text, taggers = [entity_linking_tagger], additional_result_fields = taxonomy_fields )
	
					no_connection = False
				
				except KeyboardInterrupt:
					raise KeyboardInterrupt
				
				except requests.exceptions.ConnectionError as e:
					
					retries += 1
					
					if openrefine_server:
						sys.stderr.write( "Connection to Openrefine server failed (will retry in {} seconds). Exception: {}\n".format(retrytime, e) )
					else:
						sys.stderr.write( "Connection to Solr text tagger failed (will retry in {} seconds). Exception: {}\n".format(retrytime, e) )
				
				except requests.exceptions.HTTPError as e:
					if e.response.status_code == 503:

						retries += 1
						
						if openrefine_server:
							sys.stderr.write( "Openrefine server temporarily unavailable (HTTP status code 503). Will retry in {} seconds. Exception: {}\n".format(retrytime, e) )
						else:
							sys.stderr.write( "Solr temporarily unavailable (HTTP status code 503). Will retry in {} seconds. Exception: {}\n".format(retrytime, e) )

					elif e.response.status_code == 400:
						no_connection = False

						# if the error is caused by an empty entity index for this tagger (no entities imported yet), do not log an error message / mark the document as failed
						empty_entity_index = False
						try:
							errorstatus = e.response.json()
							if errorstatus['error']['msg'] == 'field ' + entity_linking_tagger + ' has no indexed data':
								empty_entity_index = True
						except Exception:
							pass
						
						if not empty_entity_index:
							etl.error_message(docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)

					else:
						no_connection = False
						etl.error_message(docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)

				except BaseException as e:
					no_connection = False
					etl.error_message(docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)

			if verbose:
				print ("Named Entity Linking by Tagger {}: {}".format(entity_linking_tagger, results))
	
	
			# write entities from result to document facets
			for match in results:
				for candidate in results[match]['result']:
					if candidate['match']:
						for facet in candidate['type']:

							# use different facet for fuzzy/stemmed matches
							if not entity_linking_tagger == 'all_labels_ss_tag':
								# do not use another different facet if same stemmer but forced / not document language dependent
								entity_linking_tagger_withoutforceoption = entity_linking_tagger.replace('_stemming_force_', '_stemming_')
								facet = facet + entity_linking_tagger_withoutforceoption + '_ss'
							
							etl.append(data, facet, candidate['name'])
							etl.append(data, facet + '_uri_ss', candidate['id'])
							etl.append(data, facet + '_preflabel_and_uri_ss', candidate['name'] + ' <' + candidate['id'] + '>')

							if 'matchtext' in candidate:
								for matchtext in candidate['matchtext']:
									etl.append(data, facet + '_matchtext_ss', candidate['id'] + "\t" + matchtext)
							
							for taxonomy_field in taxonomy_fields:
								if taxonomy_field in candidate:
									separated_taxonomy_fields = taxonomy2fields(field=facet, data=candidate[taxonomy_field])
									for separated_taxonomy_field in separated_taxonomy_fields:
										etl.append(data, separated_taxonomy_field, separated_taxonomy_fields[separated_taxonomy_field])
	

		# mark the document as already analyzed by this plugin
		data['etl_enhance_entity_linking_b'] = "true"
		
		return parameters, data
	def etl_graph(self, parameters):

		if self.verbose:
			print("Graph has {} triples.".format(len(self.graph)) )
	
		count_triple = 0
		count_subjects = 0
	
		part_parameters = {}
		part_parameters['plugins'] = []
		part_parameters['export'] = parameters['export']
						
		property2facet = {}
		if 'property2facet' in parameters:
			property2facet = parameters['property2facet']

		etl_processor = ETL()
		etl_processor.verbose = self.verbose
		
		class_properties = []
		class_properties.append(rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'))
		class_properties.append(rdflib.term.URIRef(u'http://www.wikidata.org/prop/direct/P31'))
		# since there can be multiple triples/values for same property in/from different graphs or graph describes existing other file/document,
		# do not overwrite document but add value to existent document & values of the facet/field/property
		part_parameters['add'] = True

		# use SPARQL query with distinct to get subjects only once
		res = self.graph.query(
			"""SELECT DISTINCT ?subject
			WHERE {
			?subject ?predicate ?object .
			}""")
	
		for row in res:

			count_subjects += 1
	
			if self.verbose:
				print( "Importing entity / subject {}".format(count_subjects) )

			# get subject of the concept from first column
			subj = row[0]

			if self.verbose:
				print ( "Processing RDF subject {}".format(subj) )

			part_data = {}
			
			part_data['content_type_group_ss'] = 'Knowledge graph'
			# subject as URI/ID
			part_parameters['id'] = subj
			
			preferred_label = self.get_preferred_label(subject=subj)
			part_data['title_txt'] = preferred_label
			
			count_subject_triple = 0

			# get all triples for this subject
			for pred, obj in self.graph.predicate_objects(subject=subj):

				count_triple += 1
				count_subject_triple += 1

				if self.verbose:
					print( "Importing subjects triple {}".format(count_subject_triple) )
					print( "Predicate / property: {}".format(pred) )
					print( "Object / value: {}".format(obj) )


				try:
					
					# if the property denotes a class (RDF rdf:type or Wikidata "instance of" (Property:P31)), add the preferred label of this entity to the facet of its class,
					# so its name (label) will be available in the entities view and as a filter for faceted search
					
					if pred in class_properties:
						class_facet = str(obj)
						# map class to facet, if mapping for class exist
						if class_facet in property2facet:
							class_facet = property2facet[class_facet]
							if class_facet in parameters['facets']:
								part_data['content_type_ss'] = 'Knowledge graph class {}'.format(parameters['facets'][class_facet]['label'])
						etl.append(data=part_data, facet=class_facet, values=preferred_label)


					#
					# Predicate/property to facet/field
					#

					# set Solr datatype strings so facets not available yet in Solr schema can be inserted automatically (dynamic fields) with right datatype
					
					facet = pred + '_ss'
					facet_uri = facet + '_uri_ss'
					facet_preferred_label_and_uri = facet + '_preflabel_and_uri_ss'
					
					if self.verbose:
						print ( "Facet: {}".format(facet) )

	
					#
					# get values or labels of this object
					#

					values = self.get_values(obj=obj)
					if self.verbose:
						print ( "Values: {}".format(values) )

					# insert or append value (object of triple) to data
					etl.append(data=part_data, facet=facet, values=values)
					

					# if object is reference/URI append URI
					if type(obj) == rdflib.URIRef:
						
						uri = obj
						
						etl.append( data=part_data, facet=facet_uri, values=uri )

						# append mixed field with preferred label and URI of the object for disambiguation of different Entities/IDs/URIs with same names/labels in faceted search
						preferredlabel_and_uri = "{} <{}>".format ( self.get_preferred_label(subject=obj), obj)

					else:
						preferredlabel_and_uri = self.get_preferred_label(subject=obj)
					
					etl.append(data=part_data, facet=facet_preferred_label_and_uri, values=preferredlabel_and_uri)


				except KeyboardInterrupt:
					raise KeyboardInterrupt
	
				except BaseException as e:
					sys.stderr.write( "Exception while processing triple {} of subject {}: {}\n".format(count_subject_triple, subj, e) )
	
	
			# index the document built for this subject
			etl_processor.process( part_parameters, part_data)
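
get_values() is defined elsewhere in this class and not shown here. A rough sketch of the idea (an assumption, not the exact implementation): index a literal's plain value, and the preferred label when the object is a reference to another entity.

import rdflib

def get_values(self, obj):
    # literal objects are indexed by their plain value,
    # references (URIs) by the preferred label of the referenced entity
    if isinstance(obj, rdflib.term.Literal):
        return str(obj)
    return self.get_preferred_label(subject=obj)
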