import json
import urllib

import requests
import spotlight
import nerd
from textrazor import TextRazor

# HyperLink, DbpediaLink and read_api_key are project-local helpers and are
# assumed to be importable from the surrounding package.


def process_spotlight_api(text):
    try:
        entities = spotlight.annotate(
            "http://spotlight.dbpedia.org/rest/annotate",
            text,
            confidence=0.1,
            support=0
        )
    except Exception:
        return []
    link_matches = HyperLink.extract_all_url(text)
    initial_entities = []
    for entity in entities:
        # Shift the span by the number of double quotes preceding the entity,
        # to compensate for offset drift introduced by quote characters.
        occ = text.count('"', 0, entity["offset"] + len(entity["surfaceForm"]) - 1)
        start = entity["offset"] + occ
        end = entity["offset"] + len(entity["surfaceForm"]) + occ
        # Skip candidates that fall inside a hyperlink already present in the text.
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= start and link_match["end"] >= end:
                possible_link = True
        if not possible_link:
            e = {
                "label": entity["surfaceForm"],
                "startOffset": start,
                "endOffset": end,
                "confidence": entity["similarityScore"],
                "provenance": "dbpediaspotlight",
                "types": []
            }
            for data_type in entity["types"].split(","):
                link = data_type
                if "DBpedia:" in data_type:
                    link = "http://en.dbpedia.org/resource/" + data_type.split(":")[1]
                if "Freebase:" in data_type:
                    link = "http://www.freebase.com" + data_type.split(":")[1]
                dbpedia_type = {
                    "typeURI": None,
                    "typeLabel": data_type,
                    "entityURI": link,
                    "confidence": entity["similarityScore"],
                    "wikiURI": DbpediaLink.get_english_wikipedia_link_from_english_resource(link)
                }
                e["types"].append(dbpedia_type)
            initial_entities.append(e)
    return initial_entities


def process_textrazor_api(text):
    client = TextRazor(
        api_key='67ef1ca06614f7d202b23f1444bd7ee1ea2f916b3ecf488f8d39f800',
        extractors=[
            "entities", "topics", "words", "phrases", "dependency-trees", "senses"
        ]
    )
    try:
        response = client.analyze(text)
    except Exception:
        return []
    link_matches = HyperLink.extract_all_url(text)
    initial_entities = []
    for entity in response.entities():
        # Skip candidates that fall inside a hyperlink already present in the text.
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= entity.starting_position and link_match["end"] >= entity.ending_position:
                possible_link = True
        if not possible_link:
            e = {
                "label": entity.matched_text,
                "startOffset": entity.starting_position,
                "endOffset": entity.ending_position,
                "confidence": entity.confidence_score,
                "relevance": entity.relevance_score,
                "provenance": "textrazor",
                "wikipediaLink": entity.wikipedia_link,
                "types": []
            }
            for dbpedia_type in entity.dbpedia_types:
                wiki_link = "http://en.wikipedia.org/wiki/" + dbpedia_type
                dbpedia_type_entry = {
                    "typeURI": None,
                    "typeLabel": dbpedia_type,
                    "wikiURI": wiki_link,
                    "entityURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(wiki_link),
                    "confidence": entity.confidence_score
                }
                e["types"].append(dbpedia_type_entry)
            for freebase_type in entity.freebase_types:
                freebase_link = "http://www.freebase.com" + freebase_type
                freebase_type_entry = {
                    "typeURI": None,
                    "typeLabel": "Freebase:" + freebase_type.replace(" ", ""),
                    "wikiURI": None,
                    "entityURI": freebase_link,
                    "confidence": entity.confidence_score
                }
                e["types"].append(freebase_type_entry)
            wiki_type_entry = {
                "typeURI": None,
                "typeLabel": None,
                "wikiURI": entity.wikipedia_link,
                "entityURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity.wikipedia_link),
                "confidence": entity.confidence_score
            }
            e["types"].append(wiki_type_entry)
            initial_entities.append(e)
    return initial_entities


def process_nerd_api(text):
    try:
        timeout = 10
        text = urllib.quote_plus(text)
        n = nerd.NERD("nerd.eurecom.fr", "akkqfgos0p85mcubcfgp82rn92d23enu")
        entities = n.extract(text, "combined", timeout)
    except Exception:
        return []
    link_matches = HyperLink.extract_all_url(text)
    initial_entities = []
    for entity in entities:
        # Skip candidates that fall inside a hyperlink already present in the text.
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= entity["startChar"] and link_match["end"] >= entity["endChar"]:
                possible_link = True
        if not possible_link:
            e = {
                "label": entity["label"],
                "startOffset": entity["startChar"],
                "endOffset": entity["endChar"],
                "confidence": entity["confidence"],
                "provenance": "nerd-" + entity["extractor"],
                "types": []
            }
            if entity["extractorType"]:
                all_types = entity["extractorType"].split(",")
                for extracted_type in all_types:
                    if "dbpedia" in extracted_type:
                        type_data = {
                            "typeURI": extracted_type,
                            "typeLabel": None,
                            "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                            "entityURI": entity["uri"],
                            "confidence": entity["confidence"]
                        }
                    else:
                        type_data = {
                            "typeURI": None,
                            "typeLabel": extracted_type,
                            "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                            "entityURI": entity["uri"],
                            "confidence": entity["confidence"]
                        }
                    e["types"].append(type_data)
            if entity["nerdType"]:
                nerd_type_data = {
                    "typeURI": entity["nerdType"],
                    "typeLabel": entity["nerdType"].split("#")[1],
                    "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                    "entityURI": entity["uri"],
                    "confidence": entity["confidence"]
                }
                e["types"].append(nerd_type_data)
            initial_entities.append(e)
    return initial_entities


def process_tagme_api(text):
    try:
        lang = "en"
        url = "http://tagme.di.unipi.it/tag?"
        include_categories = "true"
        include_all_spots = "true"
        api_key = read_api_key()
        link_matches = HyperLink.extract_all_url(text)
        request = "key=" + api_key + \
                  "&include_categories=" + include_categories + \
                  "&lang=" + lang + \
                  "&include_all_spots=" + include_all_spots + \
                  "&text=" + urllib.quote_plus(text)
        entities = requests.post(url + request)
        entities = json.loads(entities.text)
    except Exception:
        return []
    initial_entities = []
    for entity in entities["annotations"]:
        # Skip candidates that fall inside a hyperlink already present in the text.
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= entity["start"] and link_match["end"] >= entity["end"]:
                possible_link = True
        if not possible_link:
            e = {
                "label": entity["spot"],
                "startOffset": entity["start"],
                "endOffset": entity["end"],
                "confidence": entity["rho"],
                "provenance": "tagme",
                "types": []
            }
            for data_type in entity["dbpedia_categories"]:
                wiki_url = "http://en.wikipedia.org/wiki/" + data_type.replace(" ", "_")
                dbpedia_type = {
                    "typeURI": None,
                    "typeLabel": data_type,
                    "entityURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(wiki_url),
                    "confidence": None,
                    "wikiURI": wiki_url
                }
                e["types"].append(dbpedia_type)
            initial_entities.append(e)
    # Keep a single candidate per surface form (the last one wins).
    initial_entities = {v["label"]: v for v in initial_entities}.values()
    return initial_entities


def process_tdh_api(text):
    url = "http://entityclassifier.eu/thd/api/v2/extraction?"
    lang = "en"
    content_format = "json"
    entity_types = ["ne", "ce"]
    priority_entity_linking = "true"
    api_key = read_api_key()
    link_matches = HyperLink.extract_all_url(text)
    initial_entities = []
    for entity_type in entity_types:
        request = "apikey=" + api_key + \
                  "&format=" + content_format + \
                  "&lang=" + lang + \
                  "&priority_entity_linking=" + priority_entity_linking + \
                  "&entity_type=" + entity_type
        try:
            entities = requests.post(url + request, urllib.quote_plus(text))
            entities = json.loads(entities.text)
        except Exception:
            return []
        for entity in entities:
            # Skip candidates that fall inside a hyperlink already present in the text.
            possible_link = False
            for link_match in link_matches:
                if link_match["start"] <= entity["startOffset"] and link_match["end"] >= entity["endOffset"]:
                    possible_link = True
            if not possible_link:
                e = {
                    "label": entity["underlyingString"],
                    "startOffset": entity["startOffset"],
                    "endOffset": entity["endOffset"],
                    "confidence": None,
                    "provenance": "thd",
                    "types": []
                }
                try:
                    for data_type in entity["types"]:
                        thd_type = {
                            "typeURI": data_type["typeURI"],
                            "typeLabel": data_type["typeLabel"],
                            "entityURI": data_type["entityURI"],
                            "confidence": data_type["salience"]["confidence"],
                            "wikiURI": DbpediaLink.get_english_wikipedia_link_from_english_resource(data_type["entityURI"])
                        }
                        e["types"].append(thd_type)
                except KeyError:
                    # Entities without the expected type structure are skipped.
                    continue
                initial_entities.append(e)
    # Keep a single candidate per surface form (the last one wins).
    initial_entities = {v["label"]: v for v in initial_entities}.values()
    return initial_entities
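

# Usage sketch (not part of the original module): the process_* helpers above
# all return entity candidates in the same dictionary schema, so their results
# can simply be concatenated into one candidate pool. `collect_all_candidates`
# is a hypothetical illustration of that pattern.
def collect_all_candidates(text):
    candidates = []
    for extractor in (process_spotlight_api, process_textrazor_api,
                      process_nerd_api, process_tagme_api, process_tdh_api):
        candidates.extend(extractor(text))
    return candidates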