def process(self, parameters=None, data=None):
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    # todo: use all data fields for analysis
    text = ''
    if 'content_txt' in data:
        text = data['content_txt']

    # extract phone number candidates (raw string so the escapes reach the regex engine)
    for match in re.finditer(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]', text):
        value = match.group(0)
        etl.append(data, 'phone_ss', value)

    # if extracted phone number(s), normalize to a format that can be used for aggregation/filters
    if 'phone_ss' in data:
        phones = data['phone_ss']
        if not isinstance(phones, list):
            phones = [phones]
        for phone in phones:
            phone_normalized = normalize_phonenumber(phone)
            etl.append(data, 'phone_normalized_ss', phone_normalized)

    return parameters, data
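
# normalize_phonenumber() is referenced above but not defined here. A minimal
# sketch of what it might look like, assuming the third-party `phonenumbers`
# library and a default region for numbers without a country code (both are
# assumptions, not part of the original plugin):

import phonenumbers

def normalize_phonenumber(phone, default_region='US'):
    # parse the raw match and render it in E.164 (e.g. +14155552671),
    # falling back to the raw string if parsing fails
    try:
        parsed = phonenumbers.parse(phone, default_region)
        return phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.E164)
    except phonenumbers.NumberParseException:
        return phone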

def process(self, parameters=None, data=None):
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    if 'enhance_file_meta_filename' in parameters:
        meta_json_file = parameters['enhance_file_meta_filename']
    else:
        if verbose:
            print('enhance_file_meta_filename not defined in config, please add config[\'enhance_file_meta_filename\'] = \'meta.json\'')
        return parameters, data

    id = parameters['id']
    if 'container' in parameters:
        id = parameters['container']

    id = id.replace('file://', '', 1)
    directory = os.path.dirname(id)
    metafile = directory + '/' + meta_json_file

    meta = {}

    if os.path.isfile(metafile):

        if zipfile.is_zipfile(metafile):
            if verbose:
                print('is zipfile: ' + metafile)
            with ZipFile(metafile) as myzip:
                fn = myzip.namelist()
                with myzip.open(fn[0]) as myfile:
                    tmp = myfile.read()
        elif metafile.endswith('.xz'):
            if verbose:
                print('is XZ file: ' + metafile)
            with lzma.open(metafile, mode='rt', encoding='utf-8') as myfile:
                tmp = myfile.read()
        else:
            if verbose:
                print('not zip nor xz: ' + metafile)
            with open(metafile) as myfile:
                tmp = myfile.read()

        try:
            meta = json.loads(tmp)
        except Exception as e:
            # format the exception instead of concatenating it to a string,
            # which would raise a TypeError
            print('Exception loading {}: {}'.format(metafile, e))
        else:
            #data.update(meta)
            for k in meta:
                etl.append(data, k, meta[k])

        if verbose:
            print('meta file: ' + metafile)
            print('meta: ' + json.dumps(meta))

    else:
        if verbose:
            print('file does not exist ' + metafile)

    return parameters, data
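
# Example of the sidecar convention this plugin assumes: a meta.json (optionally
# zip- or xz-compressed) next to the indexed file, whose fields get merged into
# the document. The paths and field values below are illustrative, not from the
# original:
#
#   /data/docs/report.pdf   <- the indexed file (parameters['id'])
#   /data/docs/meta.json    <- {"author_ss": "Jane Doe", "tag_ss": ["internal"]}
#
# config['enhance_file_meta_filename'] = 'meta.json'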

def process(self, parameters=None, data=None):
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    openrefine_server = False

    # collect/copy the text to be analyzed from all fields
    text = ''
    for field in data:
        values = data[field]
        if not isinstance(values, list):
            values = [values]
        for value in values:
            if value:
                text = "{}{}\n".format(text, value)

    if openrefine_server:
        # use REST-API on (remote) HTTP server
        params = {'text': text}
        r = requests.post(openrefine_server, params=params)
        results = r.json()
    else:
        # use local Python library
        linker = Entity_Linker()
        linker.verbose = verbose
        results = linker.entities(text=text)

    if verbose:
        print("Named Entity Linking: {}".format(results))

    for match in results:
        for candidate in results[match]['result']:
            if candidate['match']:
                for facet in candidate['type']:
                    etl.append(data, facet, candidate['name'])
                    etl.append(data, facet + '_uri_ss', candidate['id'])

    # mark the document as analyzed by this plugin
    data['enhance_entity_linking_b'] = "true"

    return parameters, data

def process(self, parameters=None, data=None):
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    openrefine_server = False

    analyse_fields = ['title_txt', 'content_txt', 'description_txt', 'ocr_t', 'ocr_descew_t']

    text = ''
    for field in analyse_fields:
        if field in data:
            text = "{}{}\n".format(text, data[field])

    if openrefine_server:
        # use REST-API on (remote) HTTP server
        params = {'text': text}
        r = requests.post(openrefine_server, params=params)
        results = r.json()
    else:
        # use local Python library
        linker = Entity_Linker()
        linker.verbose = verbose
        results = linker.entities(text=text)

    if verbose:
        print("Named Entity Linking: {}".format(results))

    for match in results:
        for candidate in results[match]['result']:
            for facet in candidate['type']:
                etl.append(data, facet, candidate['name'])
                etl.append(data, facet + '_uri_ss', candidate['id'])

    # mark the document as analyzed by this plugin
    data['enhance_entity_linking_b'] = "true"

    return parameters, data

def regex2facet(data, text, regex, group, facet, verbose=False):
    if verbose:
        print("Checking regex {} for facet {}".format(regex, facet))

    # re.finditer() returns an iterator, which is always truthy,
    # so just iterate it directly
    for match in re.finditer(regex, text, re.IGNORECASE):
        try:
            value = match.group(group)
            if verbose:
                print("Found regex {} with value {} for facet {}".format(regex, value, facet))
            etl.append(data, facet, value)
        except BaseException as e:
            # don't reference `value` here: if match.group() raised, it is unbound
            print("Exception while adding match of regex {} and group {} to facet {}:".format(regex, group, facet))
            print(e)
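
# These plugins all funnel values through etl.append(). A minimal sketch of that
# helper as the call sites above use it (the real one lives in the project's etl
# module; this reconstruction is an assumption): set the field on first use,
# convert to a list on the second value, and skip duplicate entries.

def append(data, facet, values):
    if not isinstance(values, list):
        values = [values]
    for value in values:
        if facet not in data:
            data[facet] = value
        else:
            if not isinstance(data[facet], list):
                data[facet] = [data[facet]]
            if value not in data[facet]:
                data[facet].append(value)

# usage sketch for regex2facet() above (facet name and regex are illustrative):
# doc = {}
# regex2facet(doc, 'Invoice No. 4711', r'Invoice No\. (\d+)', 1, 'invoice_ss', verbose=True)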

def process(self, parameters=None, data=None):
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    # todo: use all data fields for analysis
    text = ''
    if 'content_txt' in data:
        text = data['content_txt']

    # raw strings so the regex escapes survive Python string parsing
    for match in re.finditer(r'[\w\.-]+@[\w\.-]+', text, re.IGNORECASE):
        value = match.group(0)
        etl.append(data, 'email_ss', value)

    # if email addresses were extracted, do further analysis for separate specialized facets
    if 'email_ss' in data:

        # extract email addresses of sender (from)
        for match in re.finditer(r'From: (.* )?([\w\.-]+@[\w\.-]+)', text, re.IGNORECASE):
            value = match.group(2)
            etl.append(data, 'Message-From_ss', value)

        # extract email addresses (to)
        for match in re.finditer(r'To: (.* )?([\w\.-]+@[\w\.-]+)', text, re.IGNORECASE):
            value = match.group(2)
            etl.append(data, 'Message-To_ss', value)

        # extract the domain part of all email addresses to a facet for email domains
        data['email_domain_ss'] = []
        emails = data['email_ss']
        if not isinstance(emails, list):
            emails = [emails]
        for email in emails:
            domain = email.split('@')[1]
            etl.append(data, 'email_domain_ss', domain)

    return parameters, data
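
# Quick check of the extraction patterns above on a sample mail header
# (sample text is illustrative only):

import re

sample = "From: alice@example.org\nTo: bob@example.com\nbody text"
print(re.findall(r'[\w\.-]+@[\w\.-]+', sample))
# ['alice@example.org', 'bob@example.com']
print([m.group(2) for m in re.finditer(r'From: (.* )?([\w\.-]+@[\w\.-]+)', sample)])
# ['alice@example.org']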

def etl_graph(self, parameters):

    if self.verbose:
        print("Graph has {} triples.".format(len(self.graph)))

    count_triple = 0
    count_subjects = 0

    part_parameters = {}
    part_parameters['plugins'] = []
    part_parameters['export'] = parameters['export']

    property2facet = {}
    if 'property2facet' in parameters:
        property2facet = parameters['property2facet']

    etl_processor = ETL()
    etl_processor.verbose = self.verbose

    class_properties = []
    class_properties.append(rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'))
    class_properties.append(rdflib.term.URIRef(u'http://www.wikidata.org/prop/direct/P31'))

    # since there can be multiple triples/values for the same property,
    # do not overwrite the document but add values to the existing document & values of the facet/field/property
    part_parameters['add'] = True
    # but not for the field content_type, which doesn't change and is not multi valued
    part_parameters['fields_set'] = "content_type"

    # use a SPARQL query with DISTINCT to get each subject only once
    res = self.graph.query(
        """SELECT DISTINCT ?subject
           WHERE {
               ?subject ?predicate ?object .
           }""")

    for row in res:

        count_subjects += 1

        if self.verbose:
            print("Importing entity / subject {}".format(count_subjects))

        # get subject of the concept from first column
        subj = row[0]

        if self.verbose:
            print("Processing RDF subject {}".format(subj))

        part_data = {}
        part_data['content_type'] = 'Knowledge graph'
        part_data['content_type_group'] = 'Knowledge graph'

        # subject as URI/ID
        part_parameters['id'] = subj

        preferred_label = self.get_preferred_label(subject=subj)
        part_data['title'] = preferred_label

        count_subject_triple = 0

        # get all triples for this subject
        for pred, obj in self.graph.predicate_objects(subject=subj):

            count_triple += 1
            count_subject_triple += 1

            if self.verbose:
                print("Importing subjects triple {}".format(count_subject_triple))
                print("Predicate / property: {}".format(pred))
                print("Object / value: {}".format(obj))

            try:
                # if class, add the preferred label of this entity to the facet of its class
                # (RDF rdf:type or Wikidata "instance of" (Property:P31)),
                # so its name (label) will be available in entities view and as filter for faceted search
                if pred in class_properties:
                    class_facet = str(obj)
                    # map class to facet, if a mapping for the class exists
                    if class_facet in property2facet:
                        class_facet = property2facet[class_facet]
                    etl.append(data=part_data, facet=class_facet, values=preferred_label)

                #
                # Predicate/property to facet/field
                #

                # set Solr datatype strings so facets not yet available in the Solr schema
                # can be inserted automatically (dynamic fields) with the right datatype
                facet = pred + '_ss'
                facet_uri = facet + '_uri_ss'
                facet_preferred_label_and_uri = facet + '_preflabel_and_uri_ss'

                if self.verbose:
                    print("Facet: {}".format(facet))

                #
                # get values or labels of this object
                #
                values = self.get_values(obj=obj)
                if self.verbose:
                    print("Values: {}".format(values))

                # insert or append value (object of triple) to data
                etl.append(data=part_data, facet=facet, values=values)

                # if object is a reference/URI, append the URI
                if isinstance(obj, rdflib.term.URIRef):
                    uri = obj
                    etl.append(data=part_data, facet=facet_uri, values=uri)

                    # append mixed field with preferred label and URI of the object for disambiguation
                    # of different entities/IDs/URIs with the same names/labels in faceted search
                    preferredlabel_and_uri = "{} <{}>".format(self.get_preferred_label(subject=obj), obj)
                else:
                    preferredlabel_and_uri = self.get_preferred_label(subject=obj)

                etl.append(data=part_data, facet=facet_preferred_label_and_uri, values=preferredlabel_and_uri)

            except KeyboardInterrupt:
                raise
            except BaseException as e:
                sys.stderr.write("Exception while processing triple {} of subject {}: {}\n".format(count_subject_triple, subj, e))

        # index the document built from this subject's triples
        etl_processor.process(part_parameters, part_data)
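
# get_preferred_label() is used above but not shown. A minimal sketch of what
# such a lookup might do with rdflib, as a method on the same graph-holding
# class (assumption: prefer skos:prefLabel, fall back to rdfs:label, then to
# the URI itself; the real method may differ):

from rdflib.namespace import RDFS, SKOS

def get_preferred_label(self, subject):
    for prop in (SKOS.prefLabel, RDFS.label):
        for label in self.graph.objects(subject=subject, predicate=prop):
            return str(label)
    # no label triple found, fall back to the (stringified) URI
    return str(subject)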

def process(self, parameters=None, data=None):
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    entity_linking_taggers = ['all_labels_ss_tag']
    if 'entity_linking_taggers' in parameters:
        entity_linking_taggers = parameters['entity_linking_taggers']

    # add taggers for stemming
    entity_linking_taggers_document_language_dependent = {}
    if 'entity_linking_taggers_document_language_dependent' in parameters:
        entity_linking_taggers_document_language_dependent = parameters['entity_linking_taggers_document_language_dependent']

    if 'language_s' in data:
        # is a language specific tagger there for the detected language?
        if data['language_s'] in entity_linking_taggers_document_language_dependent:
            for entity_linking_tagger in entity_linking_taggers_document_language_dependent[data['language_s']]:
                if entity_linking_tagger not in entity_linking_taggers:
                    entity_linking_taggers.append(entity_linking_tagger)

    openrefine_server = False
    if 'openrefine_server' in parameters:
        openrefine_server = parameters['openrefine_server']

    taxonomy_fields = ['skos_broader_taxonomy_prefLabel_ss']

    # collect/copy the text to be analyzed from all fields
    text = ''
    for field in data:
        values = data[field]
        if not isinstance(values, list):
            values = [values]
        for value in values:
            if value:
                text = "{}{}\n".format(text, value)

    # tag all entities (by different taggers for different analyzers/stemmers)
    for entity_linking_tagger in entity_linking_taggers:

        # call REST API
        if openrefine_server:
            # use REST-API on (remote) HTTP server
            params = {'text': text}
            r = requests.post(openrefine_server, params=params)
            results = r.json()
        else:
            # use local Python library
            linker = Entity_Linker()
            linker.verbose = verbose
            results = linker.entities(text=text,
                                      taggers=[entity_linking_tagger],
                                      additional_result_fields=taxonomy_fields)

        if verbose:
            print("Named Entity Linking by Tagger {}: {}".format(entity_linking_tagger, results))

        # write entities from result to document facets
        for match in results:
            for candidate in results[match]['result']:
                if candidate['match']:
                    for facet in candidate['type']:

                        # use a different facet for fuzzy/stemmed matches
                        if not entity_linking_tagger == 'all_labels_ss_tag':
                            # do not use yet another facet if same stemmer but forced / not document language dependent
                            entity_linking_tagger_withoutforceoption = entity_linking_tagger.replace('_stemming_force_', '_stemming_')
                            facet = facet + entity_linking_tagger_withoutforceoption + '_ss'

                        etl.append(data, facet, candidate['name'])
                        etl.append(data, facet + '_uri_ss', candidate['id'])
                        etl.append(data, facet + '_preflabel_and_uri_ss', candidate['name'] + ' <' + candidate['id'] + '>')

                        if 'matchtext' in candidate:
                            for matchtext in candidate['matchtext']:
                                etl.append(data, facet + '_matchtext_ss', candidate['id'] + "\t" + matchtext)

                        for taxonomy_field in taxonomy_fields:
                            if taxonomy_field in candidate:
                                separated_taxonomy_fields = taxonomy2fields(field=facet, data=candidate[taxonomy_field])
                                for separated_taxonomy_field in separated_taxonomy_fields:
                                    etl.append(data, separated_taxonomy_field, separated_taxonomy_fields[separated_taxonomy_field])

    # mark the document as analyzed by this plugin
    data['etl_enhance_entity_linking_b'] = "true"

    return parameters, data
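
# taxonomy2fields() is called above but not defined here. A heavily hedged
# sketch of one plausible implementation, assuming the taxonomy arrives as a
# list of broader-concept labels ordered from the top level downwards and that
# each hierarchy level gets its own numbered facet (the field naming scheme is
# an assumption, not the project's real one):

def taxonomy2fields(field, data):
    if not isinstance(data, list):
        data = [data]
    fields = {}
    path = []
    for i, label in enumerate(data):
        path.append(label)
        # e.g. person_ss_taxonomy0_ss, person_ss_taxonomy1_ss, ...
        fields['{}_taxonomy{}_ss'.format(field, i)] = '/'.join(path)
    return fields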

def process(self, parameters=None, data=None):
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    if 'spacy_ner_mapping' in parameters:
        mapping = parameters['spacy_ner_mapping']
    else:
        mapping = {
            'ORG': 'organization_ss',
            'NORP': 'organization_ss',
            'PER': 'person_ss',
            'PERSON': 'person_ss',
            'GPE': 'location_ss',
            'LOC': 'location_ss',
            'FACILITY': 'location_ss',
            'PRODUCT': 'product_ss',
            'EVENT': 'event_ss',
            'LAW': 'law_ss',
            'DATE': 'date_ss',
            'TIME': 'time_ss',
            'MONEY': 'money_ss',
            'WORK_OF_ART': 'work_of_art_ss',
        }

    # default classifier
    classifier = 'en_core_web_sm'
    if 'spacy_ner_classifier_default' in parameters:
        classifier = parameters['spacy_ner_classifier_default']

    # set language specific classifier, if configured and document language detected
    if 'spacy_ner_classifiers' in parameters and 'language_s' in data:
        # is a language specific classifier there for the detected language?
        if data['language_s'] in parameters['spacy_ner_classifiers']:
            classifier = parameters['spacy_ner_classifiers'][data['language_s']]

    # if standard classifier configured to None and no classifier for detected language, exit the plugin
    if not classifier:
        return parameters, data

    if verbose:
        print("Using spaCy NER language / classifier: {}".format(classifier))

    analyse_fields = ['title_txt', 'content_txt', 'description_txt', 'ocr_t', 'ocr_descew_t']

    text = ''
    for field in analyse_fields:
        if field in data:
            text = "{}{}\n".format(text, data[field])

    # classify/tag each word of the content with its class
    url = "http://localhost:8080/ent"
    if os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER'):
        url = os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER') + '/ent'

    headers = {'content-type': 'application/json'}
    d = {'text': text, 'model': classifier}

    retries = 0
    retrytime = 1
    # wait time until next retry is doubled until reaching a maximum of 120 seconds (2 minutes)
    retrytime_max = 120
    no_connection = True

    while no_connection:
        try:
            if retries > 0:
                print('Retrying to connect to spaCy services in {} second(s).'.format(retrytime))
                time.sleep(retrytime)
                retrytime = retrytime * 2
                if retrytime > retrytime_max:
                    retrytime = retrytime_max

            response = requests.post(url, data=json.dumps(d), headers=headers)
            # if bad status code, raise exception
            response.raise_for_status()

            no_connection = False

        except requests.exceptions.ConnectionError as e:
            retries += 1
            sys.stderr.write("Connection to spaCy services failed (will retry in {} seconds). Exception: {}\n".format(retrytime, e))

    r = response.json()

    for ent in r:
        entity_class = ent['label']

        # get entity string from returned start and end value
        entity = text[int(ent['start']):int(ent['end'])]
        # strip whitespace from begin and end
        entity = entity.strip()

        # after strip, exclude empty entities
        if not entity:
            continue

        # if class of entity is mapped to a facet/field, append the entity to this facet/field
        if entity_class in mapping:
            if verbose:
                print("NER classified word(s)/name {} as {}. Appending to mapped facet {}".format(entity, entity_class, mapping[entity_class]))
            etl.append(data, mapping[entity_class], entity)
        else:
            if verbose:
                print("Named Entity Recognition (NER) class {} not mapped to a field/facet, ignoring entity/word(s): {}".format(entity_class, entity))

    return parameters, data
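
# The plugin above expects the spaCy HTTP service to return a JSON list of
# entity spans. An illustrative response for the text "Jane Doe works at ACME."
# (the exact payload depends on the deployed service; labels and offsets here
# are assumptions):
#
# [
#     {"label": "PERSON", "start": 0, "end": 8},
#     {"label": "ORG", "start": 18, "end": 22}
# ]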

def process(self, parameters=None, data=None):
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    if 'spacy_ner_mapping' in parameters:
        mapping = parameters['spacy_ner_mapping']
    else:
        mapping = {
            'ORG': 'organization_ss',
            'NORP': 'organization_ss',
            'PER': 'person_ss',
            'PERSON': 'person_ss',
            'GPE': 'location_ss',
            'LOC': 'location_ss',
            'FACILITY': 'location_ss',
            'PRODUCT': 'product_ss',
            'EVENT': 'event_ss',
            'LAW': 'law_ss',
            'DATE': 'date_ss',
            'TIME': 'time_ss',
            'MONEY': 'money_ss',
            'WORK_OF_ART': 'work_of_art_ss',
        }

    # default classifier
    classifier = 'en'
    if 'spacy_ner_classifier_default' in parameters:
        classifier = parameters['spacy_ner_classifier_default']

    # set language specific classifier, if configured and document language detected
    if 'spacy_ner_classifiers' in parameters and 'language_s' in data:
        # is a language specific classifier there for the detected language?
        if data['language_s'] in parameters['spacy_ner_classifiers']:
            classifier = parameters['spacy_ner_classifiers'][data['language_s']]

    # if standard classifier configured to None and no classifier for detected language, exit the plugin
    if not classifier:
        return parameters, data

    if verbose:
        print("Using spaCy NER language / classifier: {}".format(classifier))

    analyse_fields = ['title', 'content', 'description', 'ocr_t', 'ocr_descew_t']

    text = ''
    for field in analyse_fields:
        if field in data:
            text = "{}{}\n".format(text, data[field])

    # classify/tag each word of the content with its class
    url = os.getenv('OSS_SPACY_URL', default='http://localhost:8000/ent')
    headers = {'content-type': 'application/json'}
    d = {'text': text, 'model': classifier}

    response = requests.post(url, data=json.dumps(d), headers=headers)
    r = response.json()

    for ent in r:
        entity_class = ent['type']

        # get entity string from returned start and end value
        entity = text[int(ent['start']):int(ent['end'])]
        # strip whitespace from begin and end
        entity = entity.strip()

        # after strip, exclude empty entities
        if not entity:
            continue

        # if class of entity is mapped to a facet/field, append the entity to this facet/field
        if entity_class in mapping:
            if verbose:
                print("NER classified word(s)/name {} as {}. Appending to mapped facet {}".format(entity, entity_class, mapping[entity_class]))
            etl.append(data, mapping[entity_class], entity)
        else:
            if verbose:
                print("Named Entity Recognition (NER) class {} not mapped to a field/facet, ignoring entity/word(s): {}".format(entity_class, entity))

    # mark the document as analyzed by this plugin
    data['enhance_ner_spacy_b'] = "true"

    return parameters, data

def process(self, parameters=None, data=None):
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    if 'stanford_ner_mapping' in parameters:
        mapping = parameters['stanford_ner_mapping']
    else:
        # todo: extend mapping for models with more classes like dates
        mapping = {
            'PERSON': 'person_ss',
            'LOCATION': 'location_ss',
            'ORGANIZATION': 'organization_ss',
            'I-ORG': 'organization_ss',
            'I-PER': 'person_ss',
            'I-LOC': 'location_ss',
            'ORG': 'organization_ss',
            'PER': 'person_ss',
            'LOC': 'location_ss',
            'PERS': 'person_ss',
            'LUG': 'location_ss',
            'MONEY': 'money_ss',
        }

    # default classifier
    classifier = 'english.all.3class.distsim.crf.ser.gz'
    if 'stanford_ner_classifier_default' in parameters:
        classifier = parameters['stanford_ner_classifier_default']

    # set language specific classifier, if configured and document language detected
    if 'stanford_ner_classifiers' in parameters and 'language_s' in data:
        # is a language specific classifier there for the detected language?
        if data['language_s'] in parameters['stanford_ner_classifiers']:
            classifier = parameters['stanford_ner_classifiers'][data['language_s']]

    # if standard classifier configured to None and no classifier for detected language, exit the plugin
    if not classifier:
        return parameters, data

    kwargs = {}
    if 'stanford_ner_java_options' in parameters:
        kwargs['java_options'] = parameters['stanford_ner_java_options']
    if 'stanford_ner_path_to_jar' in parameters:
        kwargs['path_to_jar'] = parameters['stanford_ner_path_to_jar']

    analyse_fields = ['title_txt', 'content_txt', 'description_txt', 'ocr_t', 'ocr_descew_t']

    text = ''
    for field in analyse_fields:
        if field in data:
            text = "{}{}\n".format(text, data[field])

    # classify/tag each word of the content with its class
    st = StanfordNERTagger(classifier, encoding='utf8', verbose=verbose, **kwargs)
    entities = st.tag(text.split())

    # compound words of the same class into multi word entities
    # (result is a split on class changes instead of a split on single words/tokens)
    entities = self.multi_word_entities(entities)

    # if class of entity is mapped to a facet/field, append the entity to this facet/field
    for entity, entity_class in entities:
        if entity_class in mapping:
            if verbose:
                print("NER classified word(s)/name {} as {}. Appending to mapped facet {}".format(entity, entity_class, mapping[entity_class]))
            etl.append(data, mapping[entity_class], entity)
        else:
            if verbose:
                print("Named Entity Recognition (NER) class {} not mapped to a field/facet, ignoring entity/word(s): {}".format(entity_class, entity))

    # mark the document as analyzed by this plugin
    data['enhance_ner_stanford_b'] = "true"

    return parameters, data
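
# multi_word_entities() is called above but not shown. A minimal sketch of the
# grouping it describes, assuming the tagger output is a list of (token, class)
# pairs: consecutive tokens with the same class are joined into one multi-word
# entity (the real method may differ in details):

def multi_word_entities(self, entities):
    merged = []
    for token, entity_class in entities:
        if merged and merged[-1][1] == entity_class:
            # same class as the previous token: extend the current entity
            merged[-1] = (merged[-1][0] + ' ' + token, entity_class)
        else:
            merged.append((token, entity_class))
    return merged

# e.g. [('Barack', 'PERSON'), ('Obama', 'PERSON'), ('visited', 'O')]
# becomes [('Barack Obama', 'PERSON'), ('visited', 'O')]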

def process(self, parameters=None, data=None):
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    entity_linking_taggers = ['all_labels_ss_tag']
    if 'entity_linking_taggers' in parameters:
        entity_linking_taggers = parameters['entity_linking_taggers']

    # add taggers for stemming
    entity_linking_taggers_document_language_dependent = {}
    if 'entity_linking_taggers_document_language_dependent' in parameters:
        entity_linking_taggers_document_language_dependent = parameters['entity_linking_taggers_document_language_dependent']

    if 'language_s' in data:
        # is a language specific tagger there for the detected language?
        if data['language_s'] in entity_linking_taggers_document_language_dependent:
            for entity_linking_tagger in entity_linking_taggers_document_language_dependent[data['language_s']]:
                if entity_linking_tagger not in entity_linking_taggers:
                    entity_linking_taggers.append(entity_linking_tagger)

    openrefine_server = False
    if 'openrefine_server' in parameters:
        openrefine_server = parameters['openrefine_server']

    taxonomy_fields = ['skos_broader_taxonomy_prefLabel_ss']

    # collect/copy the text to be analyzed from all fields
    text = ''
    for field in data:
        values = data[field]
        if not isinstance(values, list):
            values = [values]
        for value in values:
            if value:
                text = "{}{}\n".format(text, value)

    # tag all entities (by different taggers for different analyzers/stemmers)
    for entity_linking_tagger in entity_linking_taggers:

        results = {}

        retries = 0
        retrytime = 1
        # wait time until next retry is doubled until reaching a maximum of 120 seconds (2 minutes)
        retrytime_max = 120
        no_connection = True

        while no_connection:
            try:
                if retries > 0:
                    print('Retrying to connect to Solr tagger in {} second(s).'.format(retrytime))
                    time.sleep(retrytime)
                    retrytime = retrytime * 2
                    if retrytime > retrytime_max:
                        retrytime = retrytime_max

                # call REST API
                if openrefine_server:
                    # use REST-API on (remote) HTTP server
                    params = {'text': text}
                    r = requests.post(openrefine_server, params=params)
                    # if bad status code, raise exception
                    r.raise_for_status()
                    results = r.json()
                else:
                    # use local Python library
                    linker = Entity_Linker()
                    linker.verbose = verbose
                    results = linker.entities(text=text,
                                              taggers=[entity_linking_tagger],
                                              additional_result_fields=taxonomy_fields)

                no_connection = False

            except KeyboardInterrupt:
                raise
            except requests.exceptions.ConnectionError as e:
                retries += 1
                if openrefine_server:
                    sys.stderr.write("Connection to OpenRefine server failed (will retry in {} seconds). Exception: {}\n".format(retrytime, e))
                else:
                    sys.stderr.write("Connection to Solr text tagger failed (will retry in {} seconds). Exception: {}\n".format(retrytime, e))
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 503:
                    retries += 1
                    if openrefine_server:
                        sys.stderr.write("OpenRefine server temporarily unavailable (HTTP status code 503), will retry in {} seconds. Exception: {}\n".format(retrytime, e))
                    else:
                        sys.stderr.write("Solr temporarily unavailable (HTTP status code 503), will retry in {} seconds. Exception: {}\n".format(retrytime, e))
                elif e.response.status_code == 400:
                    no_connection = False
                    # if the error is caused by an empty entity index for that tagger because
                    # no entities were imported yet, do not log an error / index as fail
                    empty_entity_index = False
                    try:
                        errorstatus = e.response.json()
                        if errorstatus['error']['msg'] == 'field ' + entity_linking_tagger + ' has no indexed data':
                            empty_entity_index = True
                    except Exception:
                        pass
                    if not empty_entity_index:
                        etl.error_message(docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)
                else:
                    no_connection = False
                    etl.error_message(docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)
            except BaseException as e:
                no_connection = False
                etl.error_message(docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)

        if verbose:
            print("Named Entity Linking by Tagger {}: {}".format(entity_linking_tagger, results))

        # write entities from result to document facets
        for match in results:
            for candidate in results[match]['result']:
                if candidate['match']:
                    for facet in candidate['type']:

                        # use a different facet for fuzzy/stemmed matches
                        if not entity_linking_tagger == 'all_labels_ss_tag':
                            # do not use yet another facet if same stemmer but forced / not document language dependent
                            entity_linking_tagger_withoutforceoption = entity_linking_tagger.replace('_stemming_force_', '_stemming_')
                            facet = facet + entity_linking_tagger_withoutforceoption + '_ss'

                        etl.append(data, facet, candidate['name'])
                        etl.append(data, facet + '_uri_ss', candidate['id'])
                        etl.append(data, facet + '_preflabel_and_uri_ss', candidate['name'] + ' <' + candidate['id'] + '>')

                        if 'matchtext' in candidate:
                            for matchtext in candidate['matchtext']:
                                etl.append(data, facet + '_matchtext_ss', candidate['id'] + "\t" + matchtext)

                        for taxonomy_field in taxonomy_fields:
                            if taxonomy_field in candidate:
                                separated_taxonomy_fields = taxonomy2fields(field=facet, data=candidate[taxonomy_field])
                                for separated_taxonomy_field in separated_taxonomy_fields:
                                    etl.append(data, separated_taxonomy_field, separated_taxonomy_fields[separated_taxonomy_field])

    # mark the document as analyzed by this plugin
    data['etl_enhance_entity_linking_b'] = "true"

    return parameters, data

def process(self, parameters=None, data=None):
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    entity_linking_taggers = ['all_labels_ss_tag']
    if 'entity_linking_taggers' in parameters:
        entity_linking_taggers = parameters['entity_linking_taggers']

    entity_linking_taggers_document_language_dependent = {}
    if 'entity_linking_taggers_document_language_dependent' in parameters:
        entity_linking_taggers_document_language_dependent = parameters['entity_linking_taggers_document_language_dependent']

    if 'language_s' in data:
        # is a language specific tagger there for the detected language?
        if data['language_s'] in entity_linking_taggers_document_language_dependent:
            for entity_linking_tagger in entity_linking_taggers_document_language_dependent[data['language_s']]:
                if entity_linking_tagger not in entity_linking_taggers:
                    entity_linking_taggers.append(entity_linking_tagger)

    openrefine_server = False
    if 'openrefine_server' in parameters:
        openrefine_server = parameters['openrefine_server']

    # collect/copy the text to be analyzed from all fields
    text = ''
    for field in data:
        values = data[field]
        if not isinstance(values, list):
            values = [values]
        for value in values:
            if value:
                text = "{}{}\n".format(text, value)

    if openrefine_server:
        # use REST-API on (remote) HTTP server
        params = {'text': text}
        r = requests.post(openrefine_server, params=params)
        results = r.json()
    else:
        # use local Python library
        linker = Entity_Linker()
        linker.verbose = verbose
        results = linker.entities(text=text, taggers=entity_linking_taggers)

    if verbose:
        print("Named Entity Linking: {}".format(results))

    for match in results:
        for candidate in results[match]['result']:
            if candidate['match']:
                for facet in candidate['type']:
                    etl.append(data, facet, candidate['name'])
                    etl.append(data, facet + '_uri_ss', candidate['id'])

    # mark the document as analyzed by this plugin
    data['enhance_entity_linking_b'] = "true"

    return parameters, data
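
# The Entity_Linker results that the plugins above iterate over are keyed by
# matched text, in the shape of an OpenRefine reconciliation response. An
# illustrative (assumed) example, showing only the fields the code actually
# reads:
#
# results = {
#     'Berlin': {
#         'result': [
#             {
#                 'match': True,
#                 'name': 'Berlin',
#                 'id': 'http://www.wikidata.org/entity/Q64',
#                 'type': ['location_ss'],
#             }
#         ]
#     }
# }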

def etl_graph(self, parameters):

    if self.verbose:
        print("Graph has {} triples.".format(len(self.graph)))

    count_triple = 0
    count_subjects = 0

    part_parameters = {}
    part_parameters['plugins'] = []
    part_parameters['export'] = parameters['export']

    property2facet = {}
    if 'property2facet' in parameters:
        property2facet = parameters['property2facet']

    etl_processor = ETL()
    etl_processor.verbose = self.verbose

    class_properties = []
    class_properties.append(rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'))
    class_properties.append(rdflib.term.URIRef(u'http://www.wikidata.org/prop/direct/P31'))

    # since there can be multiple triples/values for the same property in/from different graphs,
    # or the graph describes an existing other file/document,
    # do not overwrite the document but add values to the existing document & values of the facet/field/property
    part_parameters['add'] = True

    # use a SPARQL query with DISTINCT to get each subject only once
    res = self.graph.query(
        """SELECT DISTINCT ?subject
           WHERE {
               ?subject ?predicate ?object .
           }""")

    for row in res:

        count_subjects += 1

        if self.verbose:
            print("Importing entity / subject {}".format(count_subjects))

        # get subject of the concept from first column
        subj = row[0]

        if self.verbose:
            print("Processing RDF subject {}".format(subj))

        part_data = {}
        part_data['content_type_group_ss'] = 'Knowledge graph'

        # subject as URI/ID
        part_parameters['id'] = subj

        preferred_label = self.get_preferred_label(subject=subj)
        part_data['title_txt'] = preferred_label

        count_subject_triple = 0

        # get all triples for this subject
        for pred, obj in self.graph.predicate_objects(subject=subj):

            count_triple += 1
            count_subject_triple += 1

            if self.verbose:
                print("Importing subjects triple {}".format(count_subject_triple))
                print("Predicate / property: {}".format(pred))
                print("Object / value: {}".format(obj))

            try:
                # if class, add the preferred label of this entity to the facet of its class
                # (RDF rdf:type or Wikidata "instance of" (Property:P31)),
                # so its name (label) will be available in entities view and as filter for faceted search
                if pred in class_properties:
                    class_facet = str(obj)
                    # map class to facet, if a mapping for the class exists
                    if class_facet in property2facet:
                        class_facet = property2facet[class_facet]
                    if class_facet in parameters['facets']:
                        part_data['content_type_ss'] = 'Knowledge graph class {}'.format(parameters['facets'][class_facet]['label'])
                    etl.append(data=part_data, facet=class_facet, values=preferred_label)

                #
                # Predicate/property to facet/field
                #

                # set Solr datatype strings so facets not yet available in the Solr schema
                # can be inserted automatically (dynamic fields) with the right datatype
                facet = pred + '_ss'
                facet_uri = facet + '_uri_ss'
                facet_preferred_label_and_uri = facet + '_preflabel_and_uri_ss'

                if self.verbose:
                    print("Facet: {}".format(facet))

                #
                # get values or labels of this object
                #
                values = self.get_values(obj=obj)
                if self.verbose:
                    print("Values: {}".format(values))

                # insert or append value (object of triple) to data
                etl.append(data=part_data, facet=facet, values=values)

                # if object is a reference/URI, append the URI
                if isinstance(obj, rdflib.term.URIRef):
                    uri = obj
                    etl.append(data=part_data, facet=facet_uri, values=uri)

                    # append mixed field with preferred label and URI of the object for disambiguation
                    # of different entities/IDs/URIs with the same names/labels in faceted search
                    preferredlabel_and_uri = "{} <{}>".format(self.get_preferred_label(subject=obj), obj)
                else:
                    preferredlabel_and_uri = self.get_preferred_label(subject=obj)

                etl.append(data=part_data, facet=facet_preferred_label_and_uri, values=preferredlabel_and_uri)

            except KeyboardInterrupt:
                raise
            except BaseException as e:
                sys.stderr.write("Exception while processing triple {} of subject {}: {}\n".format(count_subject_triple, subj, e))

        # index the document built from this subject's triples
        etl_processor.process(part_parameters, part_data)
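
# Illustrative property2facet configuration for etl_graph() above: map RDF class
# URIs to the document facets used elsewhere in these plugins. The facet names
# are real field names from this code; which class URIs you map is an
# assumption, not from the original:
#
# config['property2facet'] = {
#     'http://www.wikidata.org/entity/Q5': 'person_ss',            # human
#     'http://www.wikidata.org/entity/Q43229': 'organization_ss',  # organization
#     'http://www.wikidata.org/entity/Q2221906': 'location_ss',    # geographic location
# }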