def test_protocol_missing():
    """An address without an http(s) scheme must raise SpotlightException.

    Fixes the original, which called spotlight.annotate without asserting
    the failure, so the expected exception propagated and failed the test
    instead of passing it (compare the assert_raises variants elsewhere
    in this file).
    """
    try:
        spotlight.annotate('localhost', 'asdasdasd',
                           headers={'fake_response': 'invalid json',
                                    'fake_status': 502})
    except spotlight.SpotlightException:
        pass  # expected: the address lacks a protocol prefix
    else:
        raise AssertionError(
            'SpotlightException not raised for protocol-less address')
def check_spotlight(tweets_list):
    """Select tweets whose text and title share annotatable surface forms.

    For each tweet, both the text and the title are annotated with DBpedia
    Spotlight.  Surface forms appearing in both are kept when they pass
    Candidate_selector (or are pure digits), numbers from get_number are
    merged in, and the result is attached to the tweet under 'blank_cand'
    (underscore-joined).

    :param tweets_list: list of dicts with at least 'text' and 'title' keys
    :return: the subset of tweets that received a non-empty 'blank_cand'
    """
    quiz_cand_list = []
    for tweet in tweets_list:
        text = tweet['text']
        title = tweet['title']
        try:
            annotations_text = spotlight.annotate(spotlight_server, text)
            annotations_title = spotlight.annotate(spotlight_server, title)
            text_surfaceform = {word['surfaceForm'] for word in annotations_text}
            title_surfaceform = {word['surfaceForm'] for word in annotations_title}
            number_set = get_number(text, title)
            # Blank candidates are surface forms shared by text and title
            # that survive the candidate filter (digits always survive).
            blank_list = [word for word in text_surfaceform & title_surfaceform
                          if Candidate_selector(word) or word.isdigit()]
            # Always include extracted numbers, avoiding duplicates.
            for word in number_set:
                if word not in blank_list:
                    blank_list.append(word)
            if blank_list:
                tweet.update({'blank_cand': '_'.join(blank_list)})
                quiz_cand_list.append(tweet)
        except Exception:
            # Best-effort: skip tweets the annotation service rejects.
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            pass
    return quiz_cand_list
def test_http_fail():
    """A faked 502 response must surface as requests' HTTPError.

    Fixes two defects in the original: the address lacked the protocol
    prefix (which triggers SpotlightException before any HTTP handling,
    compare the 'http://localhost' variants in this file), and the
    expected error was never asserted, so it failed the test run.
    """
    try:
        spotlight.annotate('http://localhost', 'asdasdasd',
                           headers={'fake_response': 'invalid json',
                                    'fake_status': 502})
    except spotlight.requests.exceptions.HTTPError:
        pass  # expected: fake_status 502 raises on the response check
    else:
        raise AssertionError('HTTPError not raised for 502 response')
def test_http_fail():
    """Annotating against a server that answers 502 raises HTTPError."""
    fake_headers = {
        'fake_response': b'invalid json',
        'fake_status': 502,
    }
    with assert_raises(spotlight.requests.exceptions.HTTPError):
        spotlight.annotate('http://localhost', 'asdasdasd',
                           headers=fake_headers)
def test_protocol_missing():
    """An address without an http(s) prefix raises SpotlightException."""
    fake_headers = {
        'fake_response': b'invalid json',
        'fake_status': 502,
    }
    with assert_raises(spotlight.SpotlightException):
        spotlight.annotate('localhost', 'asdasdasd', headers=fake_headers)
def get_entities():
    """Annotate every row of the global DataFrame and collect entity tuples.

    For each row, the 'sub' and 'obj' strings are sent to DBpedia
    Spotlight; matching annotations fill the subject/object uri, DBpedia
    type list and offset.  Missing types fall back to types_dict keyed by
    the row's hyphen-separated 'type' column.  Results are appended to the
    global ``entities`` list as 6-tuples.

    NOTE: Python-2-only (print statements).  Mutates globals COUNTER and
    entities; reads globals df, SPOTLIGHT_* and types_dict.
    """
    global df
    global COUNTER
    global entities
    for index, x in df.iterrows():
        # Progress indicator every 10 documents.
        if COUNTER % 10 == 0:
            print "%s documents tagged" % COUNTER
        COUNTER += 1
        sub = {"uri": None, "type": None, "offset": None}
        obj = {"uri": None, "type": None, "offset": None}
        # sub_type, obj_type = x.type.split("-")[0]
        # entities = []
        # types = []
        try:
            # shorten sentence to speedup.
            es = spotlight.annotate(SPOTLIGHT_URL, x['sub'], SPOTLIGHT_CONF, SPOTLIGHT_SUPPORT)
            eo = spotlight.annotate(SPOTLIGHT_URL, x['obj'], SPOTLIGHT_CONF, SPOTLIGHT_SUPPORT)
            k = es + eo
            for e in k:
                # Exact surface-form match picks the annotation for each side.
                if e["surfaceForm"] == x['sub']:
                    sub['uri'] = e['URI'].encode('utf-8')
                    # Keep only DBpedia types, excluding the generic Agent.
                    sub['type'] = [
                        i.encode('utf-8') for i in e["types"].split(',')
                        if i.startswith("DBpedia") and i != "DBpedia:Agent"
                    ]
                    sub['offset'] = e['offset']
                if e["surfaceForm"] == x['obj']:
                    obj['uri'] = e['URI'].encode('utf-8')
                    obj['type'] = [
                        i.encode('utf-8') for i in e["types"].split(',')
                        if i.startswith("DBpedia") and i != "DBpedia:Agent"
                    ]
                    obj['offset'] = e['offset']
        except Exception as e:
            # Best-effort: annotation failures leave the defaults in place.
            print e.message
        try:
            # Fallback: derive types from the row's "A-B" type label.
            if sub['type'] is None and x.type.split("-")[0] in types_dict:
                sub['type'] = types_dict[x.type.split("-")[0]]
            if obj['type'] is None and x.type.split("-")[1] in types_dict:
                obj['type'] = types_dict[x.type.split("-")[1]]
        except Exception as e:
            print e.message
        entities.append((sub['uri'], sub['type'], sub['offset'],
                         obj['uri'], obj['type'], obj['offset']))
def get_entities_link(str_, language):
    """Annotate *str_* with an entity linker chosen by *language*.

    English goes through a local Dexter instance; German and Portuguese
    use the public DBpedia Spotlight endpoints.  Returns the annotations
    as a JSON string ('[]' on failure or for non-string input).

    NOTE: Python-2-only (print statements, urllib.quote_plus).  Mutates
    the global ``count_`` call counter.
    """
    import requests
    import subprocess
    #print "passed here"
    global count_
    count_ += 1
    print count_
    annotations = []
    # to solve float errors (NaN cells arrive as floats)
    if (type(str_) == type(1.0)):
        str_ = ""
    else:
        import re
        #str_=cgi.escape(str_).encode('ascii', 'xmlcharrefreplace')
        # remove html from string
        # str_ = cleaner.clean_html(str_)
        # Escape HTML entities, then URL-encode for the GET/POST payload.
        str_ = urllib.quote_plus(
            cgi.escape(str_).encode('ascii', 'xmlcharrefreplace'))
        #print len (str_)
        #if len (str_) > 2000:
        #    print str_
    try:
        if language == "english":
            #annotations = spotlight.annotate('http://model.dbpedia-spotlight.org/en/annotate',str_, confidence=0.4, support=20)
            # url="http://localhost:8080/dexter-webapp/api/rest/annotate?min-conf=0.4&text="+str_
            # contents = urllib2.urlopen(url).read()
            # data = json.loads(contents)
            url = 'http://localhost:8080/dexter-webapp/api/rest/annotate'
            params = {'min-conf': '0.4', 'text': str_}
            response = requests.post(url, data=params)
            data = json.loads(response.text)
            #print data['spots']
            annotations = data['spots']
        elif language == "german":
            annotations = spotlight.annotate(
                'http://api.dbpedia-spotlight.org/de/annotate',
                str_,
                confidence=0.4,
                support=20)
        elif language == "portuguese":
            annotations = spotlight.annotate(
                'http://api.dbpedia-spotlight.org/pt/annotate',
                str_,
                confidence=0.4,
                support=20)
    except:
        # NOTE(review): bare except also hides KeyboardInterrupt; narrow it.
        annotations = []
        print "error trying to annotate text= "
    #print "passed here 3"
    #print json.dumps(annotations)
    return json.dumps(annotations)
def query_spotlight(key, text, failed=[]):
    """Annotate *text* with DBpedia Spotlight, throttled by config.API_LIMIT.

    :param key: identifier recorded alongside *text* when the call fails
    :param text: raw text to annotate
    :param failed: accumulator of (key, text) pairs for failed calls.
        NOTE(review): deliberately-shared mutable default — it accumulates
        failures across calls; confirm before changing it.
    :return: (True, [{surfaceForm: [relevant field values]}, ...]) on
        success with results, (False, []) on failure or empty result.
    """
    try:
        res = spotlight.annotate(config.SPOTLIGHT_URL, text,
                                 confidence=config.CONFIDENCE,
                                 support=config.SUPPORT)
    except Exception:
        # Narrowed from a bare except; still best-effort, but no longer
        # swallows KeyboardInterrupt/SystemExit.
        res = False
        failed.append((key, text))
    finally:
        # Rate-limit regardless of outcome (was duplicated in both branches).
        time.sleep(config.API_LIMIT)
    if res:
        # Keep only the configured fields per surface form.
        relevant = [
            {entry['surfaceForm']: [entry[x] for x in config.RELEVANT_SPOTLIGHTS]}
            for entry in res
        ]
        return (True, relevant)
    return (False, [])
def annotate(self, text):
    """Yield an Entity for every DBpedia-typed Spotlight annotation in *text*."""
    annotations = spotlight.annotate(self.host, text,
                                     confidence=self.confidence,
                                     support=self.support)
    for ann in annotations:
        for type_name in ann["types"].split(","):
            if not type_name.startswith("DBpedia:"):
                continue
            payload = {
                "uri": ann["URI"],
                "support": ann["support"],
                "offset": ann["offset"],
                "percentageOfSecondRank": ann["percentageOfSecondRank"],
                "similarityScore": ann["similarityScore"],
                "types": ann["types"].split(","),
            }
            yield Entity(ann["surfaceForm"],
                         type_name.split(":")[-1],
                         source_text=text,
                         data=payload,
                         confidence=ann["similarityScore"])
def run(self, document): """ :param document: Document object :return: Document after being annotated """ #document.entities = [] for sid, (start, end) in enumerate(document.sentences_boundaries): try: annotations = spotlight.annotate(self.spotlight_url, document.text[start:end], self.confidence, self.support) except Exception as e: annotations = [] for ann in annotations: e_start = document.sentences_boundaries[sid][0] + ann['offset'] if type(ann['surfaceForm']) not in [str, unicode]: ann['surfaceForm'] = str(ann['surfaceForm']) e_end = e_start + len(ann['surfaceForm']) entity = Entity(ann['URI'], boundaries=(e_start, e_end), surfaceform=ann['surfaceForm'], annotator=self.annotator_name) document.entities.append(entity) return document
def dbpedia_get(pdf_file_name):
    """Run DBpedia Spotlight NER over the sentences extracted from a PDF.

    Ensures the per-PDF JSON cache exists, reads the extraction results,
    and prints the DBpedia Location/Organization types found for every
    sentence that passes a trivial length/"no" filter.

    :param pdf_file_name: base name of the PDF whose results to process
    """
    data_folder = "C:/Users/advai/PycharmProjects/Data/"
    json_path = os.path.join(data_folder + "Jsons/", pdf_file_name + "/")
    # Build the JSON/image cache on first use.
    if not len(os.listdir(json_path)):
        get_create_json_images(pdf_file_name)
    words, sentences = read_results(pdf_file_name)
    only_place_filter = {
        'policy': "whitelist",
        'types': "DBpedia:Location, DBpedia:Organization",
        'coreferenceResolution': False
    }
    # Direct iteration replaces the original range(len(...)) index juggling.
    for page in sentences:
        for sentence_entry in page[1]:
            sentence = sentence_entry[0]
            print(sentence)
            # Skip trivially short sentences and ones containing "no"
            # (original heuristic, kept as-is).
            if len(sentence) <= 2 or "no" in sentence.lower():
                continue
            try:
                annotations = spotlight.annotate(
                    'http://15.206.75.50/rest/annotate',
                    '{}'.format(sentence),
                    confidence=0.0,
                    support=0,
                    filters=only_place_filter)
                split_annotations = annotations[0]['types'].split(",")
                print(sentence, split_annotations)
            except Exception:
                # Narrowed from a bare except; annotation failures (and an
                # empty annotation list) are simply skipped.
                pass
def DbpediaResults(txt):
    """Map surface forms in *txt* to their first DBpedia type, upper-cased.

    :param txt: text to annotate
    :return: dict {surfaceForm: TYPE} where TYPE is the first 'DBpedia:*'
        entry of each annotation's comma-separated type list; empty dict
        when the service fails or finds nothing.
    """
    dp_dict = {}
    try:
        ano_dp = spotlight.annotate(
            'http://159.226.125.180:8080/rest/annotate',
            txt, confidence=0.4, support=20, spotter='Default')
        # ano_dp = spotlight.annotate('http://api.dbpedia-spotlight.org/en/annotate', txt, confidence=0.4, support=20,spotter='Default')
        for a in ano_dp:
            if a['types'] == '':
                continue
            # Keep the first DBpedia-namespaced class for this surface form.
            for cla in a['types'].split(','):
                if cla.startswith('DBpedia'):
                    dp_dict[a['surfaceForm']] = cla.split(':')[1].upper()
                    break
    except Exception:
        # Narrowed from a bare except: no results / service errors yield {}.
        pass
    return dp_dict
def topic_entities(doc):
    """Find named entities in the topic using dbpedia spotlight.

    For each topic document, maps surfaceForm -> URI and stores the pairs
    under doc['topics'][index]['entities'].  Annotation failures leave an
    empty entity list for that topic.

    :param doc: topic dict; mutated in place and returned
    """
    url = 'http://model.dbpedia-spotlight.org/en/annotate'
    # Note: the original also built an unused 'only_place_filter' dict that
    # was never passed to annotate(); it has been removed.
    documents = topic_documents(doc)
    # enumerate() replaces the original range(len(...)) indexing.
    for index, document in enumerate(documents):
        try:
            entities = dict()
            for e in spotlight.annotate(url, document,
                                        confidence=0.5, support=50):
                entities[e['surfaceForm']] = e['URI']
        except (spotlight.SpotlightException, HTTPError):
            entities = {}
        doc['topics'][index]['entities'] = list(entities.items())
    return doc
def annotations(text):
    """Extract (subject, 'type', DBpediaClass) triplets from *text*.

    Each Spotlight annotation contributes one triplet per DBpedia type:
    the subject is the URI's resource name, the relation is always
    'type', and the object is the type name with the 'DBpedia:' prefix
    stripped.

    :param text: text to annotate
    :return: list of (subject, 'type', class) tuples; empty on failure
    """
    try:
        annot = spotlight.annotate(spotlightURL, text, confidence=0.4,
                                   support=20, spotter='Default')
    except spotlight.SpotlightException:
        # Was '' in the original; an empty list keeps the no-result value
        # the same type as the success value.
        annot = []
    except requests.exceptions.HTTPError:
        annot = []
    triplets = []
    print(annot)
    for elt in annot:
        subject = elt['URI'][len('http://dbpedia.org/resource/'):]
        function = 'type'
        # .get replaces the original try/except KeyError dance.
        objet = elt.get('types', '')
        objet = [
            x[len('DBpedia:'):] for x in objet.split(',')
            if x.startswith('DBpedia:')
        ]
        for o in objet:
            triplets.append((subject, function, o))
    return triplets
def getAnnotations(textcontent, filename):
    """Write the frequently-annotated DBpedia subjects of a transcript.

    Annotates *textcontent*, counts how often each URI's resource name
    occurs, and writes the names seen at least 10 times to
    ./output/<filename>, one per line.  On success the filename is removed
    from the global ``transcripts`` work list.

    NOTE: Python-2-only (print statements, dict.iteritems).
    """
    try:
        outputname = os.path.join('./output/', filename)
        annotations = spotlight.annotate('http://spotlight.dbpedia.org/rest/annotate', textcontent, confidence=0.5, support=20)
        subjects = []
        for i in annotations:
            uri = i['URI']
            # Resource name is the last path segment of the URI.
            subject = uri.split('/')[-1]
            subjects.append(subject)
        # Manual frequency count (collections.Counter would do the same).
        counts = {}
        for word in subjects:
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1
        outfile = open(outputname, 'w+')
        for k, v in counts.iteritems():
            # Only keep subjects mentioned at least 10 times.
            if v >= 10:
                line = k + '\n'
                outfile.write(line)
        outfile.close()
        transcripts.remove(filename)
        print len(transcripts), "left to annotate"
    except:
        # NOTE(review): bare except hides the real failure cause; narrow it.
        print "sorry, the annotation failed for:", filename
def spotlightSearch(term):
    """Annotate *term* with Spotlight and enrich its words with WordNet data.

    NOTE(review): ``spotlightTerms`` is built up below but never returned —
    the function returns the raw annotations instead; confirm intent.
    NOTE: Python-2-only (print statement).
    """
    spotlightTerms = []
    words = TextBlob(term).words
    #print words
    annotations = spotlight.annotate('http://spotlight.sztaki.hu:2222/rest/annotate', term, confidence=0.5, support=20, spotter='Default')
    print annotations
    for word in words:
        try:
            #print word, '\t', '\t', (wn.synset(word+'.n.01').definition()), '\t',(wn.synset(word+'.n.01').hypernyms() )
            # Collect word, the first annotation's URI, and WordNet info.
            spotlightTerms.append(word)
            spotlightTerms.append(annotations[0].get('URI'))
            spotlightTerms.append(wn.synset(word+'.n.01').definition())
            spotlightTerms.append(wn.synset(word+'.n.01').hypernyms())
            #spotlightTerms.append(wn.synset(word + '.n.01').hyponyms())
            #print "inside fucntion", word, annotations[0].get('URI')
        except:
            # Words with no .n.01 synset are skipped silently.
            #print word, '\t', "Nothing"
            pass
    #print spotlightTerms
    return annotations
    #term = "sound of Massive Attack"
    #spotlightSearch(term)
def dbpedia_annoations(inp_db):
    """Return DBpedia resource names that match words of the input question.

    Annotates *inp_db*, ASCII-folds each annotation URI, and keeps the
    resource name when it matches (or prefix/suffix-matches) one of the
    input's words.  'the_who' is always excluded.

    :param inp_db: input text (typically a question)
    :return: list of matched resource-name strings; empty on failure
    """
    restAPI = 'http://api.dbpedia-spotlight.org/en/annotate'
    reqk = []
    inp_word = inp_db.split()
    try:
        annotation = spotlight.annotate(restAPI, inp_db,
                                        confidence=0.09, support=20)
        for terms in annotation:
            # ASCII-fold the URI (drops accents/non-ASCII characters).
            uniterms = unicodedata.normalize('NFKD', terms['URI']).encode(
                'ascii', 'ignore')
            #print(uniterms)
            # NOTE(review): str() of a bytes object yields "b'...'" and the
            # trailing [0:-1] strips the closing quote — a Python-3 quirk
            # the matching below depends on; confirm before refactoring.
            sem_key = str(uniterms).split('/')[-1][0:-1].lower()
            #print (sem_key)
            if sem_key in inp_word and sem_key != 'the_who':
                reqk.append(str(uniterms).split('/')[-1][0:-1])
            else:
                if sem_key != 'the_who':
                    sem_key = sem_key.replace('_', ' ')
                    for xs in inp_word:
                        # Strip trailing question mark / full stop.
                        if xs[-1] == '?' or xs[-1] == '.':
                            xs = xs[:-1]
                        #print('DBp anno: '+sem_key,xs)
                        # Accept prefix/suffix matches in either direction.
                        if sem_key.startswith(xs.lower()) or xs.lower(
                        ).startswith(sem_key) or sem_key.endswith(xs.lower()):
                            reqk.append(str(uniterms).split('/')[-1][0:-1])
                            break
    except:
        # NOTE(review): bare except; the message below is assigned but unused.
        e = 'no annoation find in DBpedia'
        #print (e)
    return reqk
def get_entities_by_line(nlp, line):
    """Cross-reference Spotlight annotations with the NLP pipeline's NER.

    Each Spotlight annotation of *line* is kept only if the NLP pipeline
    found an entity with the same surface form; the NER type, bracketed
    URI, surface form and character span are returned per entity.

    :param nlp: callable returning an object with `.entities` (each having
        a `.type` attribute) — e.g. a spaCy-like pipeline
    :param line: text to annotate
    :return: list of entity dicts; empty list on any failure
        (the original returned "" on failure, an inconsistent type)
    """
    try:
        annotations = spotlight.annotate(
            'http://api.dbpedia-spotlight.org/en/annotate',
            line, confidence=0.4, support=20)
        # annotations = spotlight.annotate('http://model.dbpedia-spotlight.org/en/annotate', line,
        #                                  confidence=0.4, support=20)
        # Hoisted out of the loop: nlp(line) is invariant per call.
        ner_entities = nlp(line).entities
        entities = list()
        for re_ano in annotations:
            entity = dict()
            entity['URI'] = "<" + re_ano['URI'] + ">"
            entity['surfaceForm'] = re_ano['surfaceForm']
            entity['types'] = ""
            for ent in ner_entities:
                if str(ent) == entity['surfaceForm']:
                    entity['types'] = str(ent.type)
            # Drop annotations the NER pipeline did not confirm.
            if len(entity['types']) == 0:
                continue
            entity['start'] = re_ano['offset']
            entity['end'] = entity['start'] + len(entity['surfaceForm'])
            entities.append(entity)
        return entities
    except Exception:
        # Narrowed from a bare except; failures yield an empty list so the
        # return type matches the success path.
        print(line)
        print("spotlight exception")
        return []
def main(input_folder, output_folder):
    """Geocode location-name files via DBpedia Spotlight.

    For every file in *input_folder*, annotate each unique location name
    with a DBpedia:Place whitelist filter and write
    "name<TAB>URI<TAB>similarityScore" rows to the same-named file in
    *output_folder*.  Names the service cannot resolve are skipped.
    """
    # Hoisted: the filter dict is identical for every name.
    only_place_filter = {
        'policy': "whitelist",
        'types': "DBpedia:Place",
        'coreferenceResolution': False
    }
    for file_name in os.listdir(input_folder):
        print("Processing " + file_name + "...")
        # with-blocks fix the original's leaked input file handle.
        with open(input_folder + file_name, 'r') as inp_file:
            location_names_unique = set(x.strip() for x in inp_file)
        print(len(location_names_unique))
        with open(output_folder + file_name, 'w') as out_file:
            for location_name in location_names_unique:
                try:
                    dbpedia_output = spotlight.annotate(
                        "http://api.dbpedia-spotlight.org/en/annotate",
                        location_name.strip(),
                        filters=only_place_filter)
                    curr_row = [
                        location_name,
                        dbpedia_output[0]['URI'],
                        str(dbpedia_output[0]['similarityScore']),
                    ]
                    out_file.write("\t".join(curr_row) + "\n")
                except Exception:
                    # Narrowed from a bare except; still covers both the
                    # service call and an empty annotation list.
                    continue
def get_linked_entities_spotlight(facts):
    """Attach Spotlight entities and their Wikipedia pages to every fact."""
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    spotlight_url = 'http://model.dbpedia-spotlight.org/en/annotate'
    for fact in facts:
        print(fact.text)
        try:
            found = spotlight.annotate(spotlight_url, fact.text,
                                       confidence=0.4, support=20)
        except spotlight.SpotlightException as err:
            print('No annotaions')
            continue
        fact.set_entities(found)
        for entity in found:
            # Look up the Wikipedia page that is the primary topic of the URI.
            sparql.setQuery(("""
            PREFIX foaf: <http://xmlns.com/foaf/0.1/>
            SELECT ?isPrimaryTopicOf
            WHERE { <%s> foaf:isPrimaryTopicOf ?isPrimaryTopicOf }
            """) % entity['URI'])
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
            for binding in results["results"]["bindings"]:
                fact.set_wp_link(binding["isPrimaryTopicOf"]["value"])
def annotate_dbpedia(text, confidence, sport):
    """Annotate *text* with DBpedia Spotlight and persist the entities found.

    Each annotation's URI is looked up on DBpedia for a name and abstract,
    the surface form in *text* is replaced by the short URI, and the
    entity is stored via persist() under *sport*.

    :param text: text to annotate (returned with surface forms replaced)
    :param confidence: Spotlight confidence threshold
    :param sport: sport/category key used when persisting entities
    :return: (rewritten text, ["Name:Type" or "Name", ...])
    """
    annotations_list = []
    try:
        annotations = spotlight.annotate(
            'https://api.dbpedia-spotlight.org/en/annotate',
            text, confidence=confidence)
        for annotation in annotations:
            uri = annotation['URI']
            # Most specific type (last in the list), CamelCase -> spaced.
            # Renamed from `type`, which shadowed the builtin.
            entity_type = re.sub(
                "([a-z])([A-Z])", "\g<1> \g<2>",
                annotation['types'].rsplit(',', 1)[-1].rsplit(':', 1)[-1])
            # Fetch name and abstract from DBpedia for the identified URI.
            try:
                name, abstract = query_dbpedia(uri)
                if not name:
                    # Fall back to the entity's surface form in the text.
                    name = annotation['surfaceForm']
                if not abstract:
                    # Missing abstract: leave the field empty.
                    abstract = ""
            except (TypeError, requests.exceptions.HTTPError) as error:
                print("ERROR: {} {}".format(uri, error))
                name = annotation['surfaceForm']
                abstract = ""
            # Keep only the last (lowercased below) segment of the URI.
            uri = uri.rsplit('/', 1)[-1]
            # Replace the entity's surface form in the text with the URI.
            text = re.sub(r'\b%s\b' % (annotation['surfaceForm']), uri, text)
            if entity_type:
                annotations_list.append(name + ":" + entity_type)
            else:
                annotations_list.append(name)
            # Store the identified entity in the DB.
            persist(sport, uri.lower(), name, abstract, entity_type)
    except (spotlight.SpotlightException,
            requests.exceptions.HTTPError) as error:
        print("DBPEDIA ERROR: {}".format(error))
    return text, annotations_list
def get_named_entities(self, entity_type="PERSON"):
    """Return the set of person names Spotlight finds in self.text.

    Annotates self.text against the Portuguese Spotlight model and keeps
    the upper-cased, ASCII-folded surface forms whose type list contains
    a person type.

    :param entity_type: unused; kept for interface compatibility
    :return: set of names, or [] when the service call fails (the original
        failure value, preserved for callers that compare against it)
    """
    try:
        annotations = spotlight.annotate(
            'http://model.dbpedia-spotlight.org/pt/annotate',
            self.text, confidence=0.4, support=20)
    except (ValueError, spotlight.SpotlightException,
            requests.exceptions.HTTPError,
            requests.exceptions.ConnectionError):
        return []
    allowed_types = {"Schema:Person", "DBpedia:Person",
                     "Http://xmlns.com/foaf/0.1/Person"}
    result = set()
    for annotation in annotations:
        types = set(annotation["types"].split(","))
        # Set intersection replaces the original reduce(or) over a
        # temporary membership list.
        if not allowed_types.isdisjoint(types):
            graph_node = annotation["surfaceForm"].upper()
            # Strip accents: NFKD-decompose, then drop non-ASCII marks.
            graph_node = normalize('NFKD', graph_node).encode(
                'ASCII', 'ignore').decode('ASCII')
            result.add(graph_node)
    return result
def dbpedia_extract_spans(line):
    """Return token-index spans for Spotlight annotations found in *line*.

    Character offsets from Spotlight are converted to token indices via
    get_offset_to_index_dict; a span is kept only when the tokens it
    covers reproduce the annotation's surface form exactly.

    :param line: raw input line (tokens assumed space-separated)
    :return: list of {'st': start_token, 'ed': end_token, 'text': form}
    """
    validate = True
    threshold = 0.5
    text = line.strip()
    nps = []
    tokens = text.split(' ')
    try:
        token_offset_to_index = get_offset_to_index_dict(text)
        annotations = spotlight.annotate('http://localhost:2222/rest/annotate',
                                         line, confidence=threshold)
        for annotation in annotations:
            offset = annotation['offset']
            surfaceForm = annotation['surfaceForm']
            # Number of spaces == number of extra tokens the form spans.
            spaceNum = len(re.findall(' ', surfaceForm))
            try:
                st = token_offset_to_index[offset]
                ed = st + spaceNum + 1
                span = {'st': st, 'ed': ed, 'text': surfaceForm}
                # Keep the span only if it round-trips to the surface form.
                if ' '.join(tokens[st:ed]) == surfaceForm:
                    nps.append(span)
            except KeyError as e:
                # Offset not aligned to a token start; skip this annotation.
                pass
        if validate:
            if not validate_nps(nps, tokens):
                pass
                # ipdb.set_trace();
    except (SpotlightException, HTTPError) as e:
        # Service errors yield an empty span list.
        pass
    except Exception as e:
        print(e)
        # ipdb.set_trace();
    return nps
def get_linked_entity(text, confidence=0.5):
    """Annotate *text* via the DBpedia Spotlight REST endpoint."""
    endpoint = 'http://api.dbpedia-spotlight.org:2226/rest/annotate'
    return spotlight.annotate(endpoint, text, confidence,
                              support=20, spotter='Default')
def findMatchesFromDBPedia(requestParameter):
    "This function finds details from DBPedia Spotlight"
    # NOTE: Python-2-only (print statement below).
    annotations = spotlight.annotate('http://spotlight.sztaki.hu:2222/rest/annotate',requestParameter, confidence=0.4, support=0,spotter='Default')
    # Only the first annotation's comma-separated type list is used.
    matches = annotations[0]['types']
    # The most specific type name is the part after the last colon.
    print(matches[matches.rfind(':')+1:])
    searchMatches = searchLucene(matches[matches.rfind(':')+1:])
    print len(searchMatches)
    return searchMatches
def get_dbp_id(text, confidence=0.4, support=20):
    """Return the DBpedia resource URIs Spotlight finds in *text*.

    :param text: text to annotate
    :param confidence: Spotlight confidence threshold (default 0.4)
    :param support: minimum resource prominence (default 20)
    :return: list of 'URI' values, one per annotation
    """
    annotations = spotlight.annotate(
        'http://spotlight.dbpedia.org/rest/annotate',
        text, confidence=confidence, support=support)
    # Comprehension replaces the original manual append loop.
    return [a['URI'] for a in annotations]
def extract_concepts(text):
    """Annotate *text* against a local Spotlight instance.

    :param text: text to annotate
    :return: list of annotation dicts, or [] when the service call fails
    """
    try:
        return spotlight.annotate("http://127.0.0.1:2229/rest/annotate",
                                  text, confidence=0.5, support=100)
    except Exception:
        # The bound exception variable was unused; any failure simply
        # means "no concepts found".
        return []
def post(self):
    """Proxy the posted 'data' text to DBpedia Spotlight and return JSON."""
    payload = request.json.get('data')
    annotations = spotlight.annotate(
        'https://api.dbpedia-spotlight.org/en/annotate', payload)
    return app.response_class(response=json.dumps(annotations),
                              status=200,
                              mimetype='application/json')
def process_spotlight_api(text):
    """Build entity records from Spotlight annotations of *text*,
    dropping any annotation that falls inside a hyperlink span.

    :return: list of entity dicts with label/offsets/confidence/types,
        or [] when the service call fails
    """
    try:
        entities = spotlight.annotate(
            "http://spotlight.dbpedia.org/rest/annotate",
            text,
            confidence=0.1,
            support=0
        )
    except:
        # NOTE(review): bare except also hides KeyboardInterrupt; narrow it.
        return []
    link_matches = HyperLink.extract_all_url(text)
    initial_entities = []
    for entity in entities:
        occ = 0
        # NOTE(review): dead branch — `occ` was assigned 0 on the line
        # above, so `occ is not 0` can never be true; "serviceForm" also
        # looks like a typo for "surfaceForm".  Presumably this once
        # adjusted offsets for quote characters; confirm intent.
        if occ is not 0:
            occ = text.count('"', 0, entity["offset"] + len(entity["serviceForm"]) - 1)
        start = entity["offset"] + occ
        end = entity["offset"] + len(entity["surfaceForm"]) + occ
        # Skip annotations fully contained in a detected URL.
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= start and link_match["end"] >= end:
                possible_link = True
        if not possible_link:
            e = {
                "label": entity["surfaceForm"],
                "startOffset": start,
                "endOffset": end,
                "confidence": entity["similarityScore"],
                "provenance": "dbpediaspotlight",
                "types": []
            }
            types = []
            for data_type in entity["types"].split(","):
                link = data_type
                # Map namespaced type names to resource URLs.
                if "DBpedia:" in data_type:
                    link = "http://en.dbpedia.org/resource/" + data_type.split(":")[1]
                if "Freebase:" in data_type:
                    link = "http://www.freebase.com" + data_type.split(":")[1]
                dbpedia_type = {
                    "typeURI": None,
                    "typeLabel": data_type,
                    "entityURI": link,
                    "confidence": entity["similarityScore"],
                    "wikiURI": DbpediaLink.get_english_wikipedia_link_from_english_resource(link)
                }
                types.append(dbpedia_type)
            # NOTE(review): this appends the whole `types` list as one
            # element of e["types"] (a nested list); extend() may have
            # been intended.
            e["types"].append(types)
            initial_entities.append(e)
    return initial_entities
def retrieve_entities(text):
    """Annotate *text* with the default spotter and no thresholds."""
    return spotlight.annotate(host, text, confidence=0, support=0,
                              spotter='Default')
def complexQuery(term):
    """Query Freesound and Jamendo for sounds related to *term*.

    Dependency-parses *term*, extracts preposition relations, expands
    them via DBPedia, and queries both sound APIs for each expansion.

    NOTE: Python-2-only (print statements, urllib2).
    NOTE(review): the inner `for query in results` loop shadows the
    earlier `query` list; and `test`/`test2` are only bound inside that
    loop, so the final return raises NameError when no preposition is
    found — confirm against the original file.
    """
    #test = json.load(urllib2.urlopen("http://www.freesound.org/apiv2/search/text/?query="+term+"&token=06mS7W2OiXidVC2tQ4ikMfe3nomU7rBptaJgBCvp"))
    #test2 = json.load(urllib2.urlopen("https://api.jamendo.com/v3.0/tracks/?client_id=4cb8fab9&format=jsonpretty&name="+term))
    annotator = Annotator()
    dep_parse = annotator.getAnnotations(term, dep_parse=True)['dep_parse']
    dp_list = dep_parse.split('\n')
    #spotlightTerms = WordNet.spotlightSearch(term)
    #print "spotlight terms %s" %spotlightTerms
    #print "dp list %s" %dp_list
    spotlightTerms = spotlight.annotate(
        'http://spotlight.sztaki.hu:2222/rest/annotate',
        term,
        confidence=0.3,
        support=20,
        spotter='Default')
    #print term, '\t', spotlightTerms[1].get('URI')
    #print spotlightTerms[0].get('URI')
    secondDep = ""
    query = []
    for prep in dp_list:
        elementPrep = "prep"
        if elementPrep in prep:
            # Extract the preposition type ("prep_<type>(...)") and the
            # second dependency token ("(head, token-idx)").
            print("We found preposition1: %s" % prep[prep.find("_") + 1:prep.find("(")])
            prepType = prep[prep.find("_") + 1:prep.find("(")]
            print("We found preposition2: %s" % prep[prep.find(" ") + 1:prep.find(")")])
            secondDep = prep[prep.find(" ") + 1:prep.find(")")].split("-")
            print secondDep[0]
            query.append(prepType)
            query.append(secondDep[0])
            # "like" relations are expanded from the raw token; others use
            # the second Spotlight annotation's URI.
            if prepType == "like":
                results = DBPedia.dpbediaQuery(prepType, secondDep[0])
            else:
                results = DBPedia.dpbediaQuery(prepType, spotlightTerms[1].get('URI'))
            print results
            for query in results:
                test = json.load(
                    urllib2.urlopen(
                        "http://www.freesound.org/apiv2/search/text/?query=" +
                        query +
                        "&token=06mS7W2OiXidVC2tQ4ikMfe3nomU7rBptaJgBCvp"))
                test2 = json.load(
                    urllib2.urlopen(
                        "https://api.jamendo.com/v3.0/tracks/?client_id=4cb8fab9&format=jsonpretty&name=" +
                        query))
                print(test)
                #print(test2)
    return test, test2
def get_response(self, text):
    """Annotate *text* using the configured Spotlight url/minconf.

    @rtype: dict
    """
    # NOTE: Python-2-only except syntax.
    try:
        annotations = spotlight.annotate(self.pars["url"], text, confidence=self.pars["minconf"])
    except spotlight.SpotlightException, msg:
        print "SpotlightException: {}".format(msg)
        return {}
    # NOTE(review): on success this falls through and implicitly returns
    # None rather than `annotations` — a `return annotations` looks
    # missing here; possibly truncated source. Confirm against the
    # original file.
def get_annotations(text):
    """Annotate *text* with the configured Spotlight host and filters.

    :param text: text to annotate
    :return: list of annotations, or the string "No annotations found"
        when Spotlight reports none (kept for backward compatibility,
        though an empty list would be more consistent for callers).
    """
    try:
        # Fixed: confidence was passed as -confidence_level, which negated
        # the configured threshold and effectively disabled filtering.
        return spotlight.annotate(annotations_host, text,
                                  confidence=confidence_level,
                                  support=support_level,
                                  filters=filters)
    except spotlight.SpotlightException:
        return "No annotations found"
def enrich(graph):
    """Tag every BBC ContentItem in *graph* with DBpedia entities found
    in schema.org descriptions (adds bbc search-schema 'tag' triples)."""
    for uri in graph.subjects(predicate=RDF.type, object=URIRef('http://www.bbc.co.uk/search/schema/ContentItem')):
        # NOTE(review): this iterates ALL schema.org descriptions in the
        # graph for every uri — graph.objects(subject=uri, ...) was
        # probably intended; confirm before relying on these tags.
        for desc in graph.objects(predicate=URIRef('http://schema.org/description')):
            try:
                annotations = spotlight.annotate('http://spotlight.dbpedia.org/rest/annotate', str(desc), confidence=0.4, support=20)
                for tag in annotations:
                    graph.add((URIRef(uri), URIRef('http://www.bbc.co.uk/search/schema/tag'), URIRef(tag['URI'])))
            except spotlight.SpotlightException:
                # Descriptions with no annotatable resources are skipped.
                continue
def run_query(query):
    """Annotate *query* via the public Spotlight endpoint; [] on failure.

    NOTE: Python-2-only (print statement).
    """
    confidence = 0.2
    support = 15
    try:
        annotations = sp.annotate('http://spotlight.dbpedia.org/rest/annotate',query,confidence,support)
    except:
        # NOTE(review): bare except also hides KeyboardInterrupt; narrow it.
        print "No resources returned"
        annotations = []
    return annotations
def getDbpediaMatches(requestParameterSelf, requestParameterForward):
    """Collect the DBpedia type of the first annotation plus WordNet matches.

    :param requestParameterSelf: text sent to DBpedia Spotlight
    :param requestParameterForward: text forwarded to getWordNetMatches
    :return: concatenated matching-words string
    """
    matchingWords = ""
    try:
        annotations = spotlight.annotate(
            'http://spotlight.sztaki.hu:2222/rest/annotate',
            requestParameterSelf,
            confidence=0.4, support=0, spotter='Default')
        matches = annotations[0]['types']
        # Most specific type name is the part after the last colon.
        typeofline = matches[matches.rfind(':') + 1:]
        matchingWords = matchingWords + typeofline
    except Exception:
        # The original used a bare except whose handler was the no-op
        # expression ""; failures simply leave matchingWords empty.
        pass
    matchingWords = getWordNetMatches(requestParameterForward, matchingWords)
    return matchingWords
def get_spotlight_annotation(text, lang="fr"):
    """Annotate *text* using the SZTAKI Spotlight instance for *lang*.

    The per-language port is looked up in LANG_PORTS.  Returns the
    annotation list, or [] when the service call fails.

    NOTE: Python-2-only (print statements).
    """
    import spotlight
    try:
        annotations = spotlight.annotate('http://spotlight.sztaki.hu:{}/rest/annotate'.format(LANG_PORTS[lang]), text, confidence=0.6, support=20, spotter='Default')
    except:
        # NOTE(review): bare except also hides KeyboardInterrupt; narrow it.
        print "could not get info from spotlight"
        print text
        return []
    return annotations
def getAnnotation(text):
    """Collect entity URIs for *text* from Spotlight and AlchemyAPI.

    :return: (merged URI set, Spotlight annotations sorted by
        similarityScore, raw AlchemyAPI response)
    """
    annotations = spotlight.annotate('http://spotlight.dbpedia.org/rest/annotate',text,confidence=0.25, support=40)
    annotationsSorted = sorted(annotations, key=lambda k: k['similarityScore'])
    setSpotlight=set(map(lambda x:x['URI'],annotationsSorted))
    # Sample Spotlight annotation, for reference:
    """ { u'URI': u'http://dbpedia.org/resource/People', u'offset': 321, u'percentageOfSecondRank': -1.0, u'similarityScore': 0.08647863566875458, u'support': 426, u'surfaceForm': u'people', u'types': u'DBpedia:TopicalConcept'} """
    alchemyapi = AlchemyAPI()
    response = alchemyapi.entities('text', text, {'sentiment': 1})
    # Keep only Alchemy entities that carry disambiguation links.
    resFilt=filter(lambda x: 'disambiguated' in x, response['entities'])
    key=['dbpedia','geonames','yago','opencyc']
    # NOTE(review): bare `resFilt` expression below is a no-op.
    resFilt
    entitySet=set()
    for r in resFilt:
        # Collect every known knowledge-base link for each entity.
        for k in key:
            if k in r['disambiguated']:
                entitySet.add(r['disambiguated'][k])
    # Sample AlchemyAPI entity, for reference:
    """ {u'count': u'1', u'disambiguated': {u'dbpedia': u'http://dbpedia.org/resource/Kathmandu', u'freebase': u'http://rdf.freebase.com/ns/m.04cx5', u'geo': u'27.716666666666665 85.36666666666666', u'geonames': u'http://sws.geonames.org/1283240/', u'name': u'Kathmandu', u'subType': [u'TouristAttraction'], u'website': u'http://www.kathmandu.gov.np/', u'yago': u'http://yago-knowledge.org/resource/Kathmandu'}, u'relevance': u'0.33', u'sentiment': {u'type': u'neutral'}, u'text': u'Kathmandu', u'type': u'City'}, """
    entitySet.update(setSpotlight)
    return entitySet,annotationsSorted,response
def main():
    """Enhance Solr documents with DBpedia Spotlight entities.

    Iterates the Solr result set, marks previously-enhanced documents,
    and annotates FIELD_TO_ENHANCE values for the rest.

    NOTE: Python-2-only (print statements, has_key, old except syntax).
    NOTE(review): the outer try below has no matching except/finally in
    this excerpt, and `resp` is never advanced inside the while loop —
    this block appears truncated; confirm against the full source before
    editing.
    """
    entities_recognized = []
    resp = solr_db.select(query)
    DOCS_RETRIEVED = DOCS_PREVIOUSLY_ENHANCED = 0
    n_enhance_attempts = 0
    try:
        while (resp):
            for doc in resp.results:
                DOCS_RETRIEVED += 1
                if doc.has_key('entity_ss'):
                    # Already enhanced: just flag it as spotlighted.
                    DOCS_PREVIOUSLY_ENHANCED +=1
                    doc
                    doc_up = {'id':doc['id'], 'spotlighted_b':{'update':'true'}}
                    try:
                        solr_db.add(doc_up)
                    except SolrException, e:
                        # 400s (bad update) are tolerated; anything else re-raises.
                        if not e.httpcode == 400:
                            raise e
                    continue
                if doc.has_key(FIELD_TO_ENHANCE):
                    for fvalue in doc[FIELD_TO_ENHANCE]:
                        n_enhance_attempts += 1
                        # Progress indicator every 100 attempts.
                        if n_enhance_attempts % 100 == 0:
                            print "NDOCS:", str(DOCS_RETRIEVED), ' -> ', FIELD_TO_ENHANCE, fvalue.encode('utf-8')
                        #TODO: run each enhancer, get entity data then set the
                        # the entity_ss using update syntax
                        try:
                            annotations = spotlight.annotate('http://spotlight.dbpedia.org/rest/annotate', fvalue)
                        except spotlight.SpotlightException, e:
                            exception_resp.append(e)
                            # "No Resources found" is expected; anything else is fatal.
                            if not "No Resources found" in e.message:
                                print "NUM:", str(DOCS_RETRIEVED), " EEEE->", str(e)
                                print e.args, e.message
                                raise e
                        except ConnectionError, e:
                            # Back off for 30 minutes on connection loss.
                            time.sleep(1800)
                            continue
                        except HTTPError, e:
                            #TODO: logger
                            continue
def annotate_posts(text):
    """Run DBpedia Spotlight over *text* with the configured thresholds."""
    return spotlight.annotate(DBPEDIA_URL, text,
                              confidence=CONFIDENCE, support=SUPPORT)
def test_candidates_invalid_json():
    """A response body that is not valid JSON must raise ValueError.

    Fixes the original, which never asserted the failure, so the expected
    ValueError propagated and failed the test instead of passing it
    (compare the assert_raises variant elsewhere in this file).
    """
    try:
        spotlight.annotate('http://localhost', 'asdasdasd',
                           headers={'fake_response': 'invalid json'})
    except ValueError:
        pass  # expected: decoding the fake body as JSON fails
    else:
        raise AssertionError('ValueError not raised for invalid JSON')
def test_missing_resources():
    """Valid JSON without a Resources key must raise SpotlightException.

    Fixes the original, which never asserted the failure, so the expected
    exception propagated and failed the test instead of passing it
    (compare the assert_raises variant elsewhere in this file).
    """
    try:
        spotlight.annotate('http://localhost', 'asdasdasd',
                           headers={'fake_response': '{"Test": "Win"}'})
    except spotlight.SpotlightException:
        pass  # expected: the fake body has no Resources section
    else:
        raise AssertionError(
            'SpotlightException not raised for missing Resources')
def test_http_fail():
    """A faked 502 status must surface as requests' HTTPError.

    Fixes the original, which never asserted the failure, so the expected
    error propagated and failed the test instead of passing it (compare
    the assert_raises variant elsewhere in this file).
    """
    try:
        spotlight.annotate('http://localhost', 'asdasdasd',
                           headers={'fake_response': 'invalid json',
                                    'fake_status': 502})
    except spotlight.requests.exceptions.HTTPError:
        pass  # expected: fake_status 502 raises on the response check
    else:
        raise AssertionError('HTTPError not raised for 502 response')
def test_protocol_missing():
    """An address without an http(s) scheme must raise SpotlightException.

    Fixes the original, which never asserted the failure, so the expected
    exception propagated and failed the test instead of passing it
    (compare the assert_raises variant elsewhere in this file).
    """
    try:
        spotlight.annotate('localhost', 'asdasdasd',
                           headers={'fake_response': 'invalid json',
                                    'fake_status': 502})
    except spotlight.SpotlightException:
        pass  # expected: the address lacks a protocol prefix
    else:
        raise AssertionError(
            'SpotlightException not raised for protocol-less address')
def test_protocol_missing():
    """Annotating via a protocol-less address raises SpotlightException."""
    bad_address = 'localhost'
    fake_headers = {'fake_response': b'invalid json', 'fake_status': 502}
    with assert_raises(spotlight.SpotlightException):
        spotlight.annotate(bad_address, 'asdasdasd', headers=fake_headers)
def _recognizeEntities(self, text):
    """Call the public Spotlight endpoint with fixed thresholds."""
    endpoint = 'http://spotlight.dbpedia.org/rest/annotate'
    return spotlight.annotate(endpoint, text, confidence=0.5, support=20)
def test_annotation_invalid_json():
    """An annotate call with a bad fake response must fail — and be asserted.

    The original asserted nothing, so any raised exception failed the
    test.  NOTE(review): the address 'localhost' lacks a protocol, which
    triggers SpotlightException before the invalid-JSON body is parsed
    (the candidates variant uses 'http://localhost' to reach the
    JSON-decoding ValueError); both are accepted here pending a decision
    on the intended address.
    """
    try:
        spotlight.annotate('localhost', 'asdasdasd',
                           headers={'fake_response': 'invalid json'})
    except (spotlight.SpotlightException, ValueError):
        pass  # expected failure either way
    else:
        raise AssertionError('annotate unexpectedly succeeded')
# Script: annotate one JSON article (title + abstract) with DBpedia
# Spotlight and dump the annotations to a JSON output file.
# Usage: script.py <input.json> <output.json>
# NOTE: Python-2-only (old `except Exception, e` syntax).

# Per-language ports of the SZTAKI-hosted Spotlight instances.
LANG_PORTS = { "english": '2222', "german": '2226', "dutch": '2232', "hungarian": '2229', "french": '2225', "portuguese": '2228', "italian": '2230', "russian": '2227', "turkish": '2235', "spanish": '2231' }
port = LANG_PORTS["english"]
url ="http://spotlight.sztaki.hu:%s/rest/annotate" % port
input_filename = sys.argv[1]
output_filename = sys.argv[2]
# NOTE(review): this placeholder is dead — `text` is overwritten from the
# article below before its first use.
text = "This is a test with Berlin"
with open(input_filename,'r') as f_in, open(output_filename, 'w') as f_out:
    article = json.load(f_in)
    text = article["title"] + " \n " + article["abstract"]
    try:
        annotations = spotlight.annotate(
            url,
            text,
            spotter="Default",
            disambiguator="Default",
            confidence=0.5,
            support=0)
    except Exception, e:
        # Best-effort: log the failure and emit an empty annotation list.
        logging.error("%s %s" % (input_filename, str(e)))
        annotations = []
    json.dump(annotations, f_out, indent=2)
def annotateHTML(html):
    """Return the DBpedia URIs Spotlight finds in *html*.

    :param html: raw HTML/text passed straight to the annotator
    :return: list of 'URI' values, one per annotation
    """
    annotation = spotlight.annotate(
        "http://spotlight.dbpedia.org/rest/annotate", html)
    # Comprehension replaces the original range(len(...)) loop that
    # grew the list via `+=` concatenation.
    return [item['URI'] for item in annotation]
def test_candidates_invalid_json():
    """A non-JSON candidates response must raise ValueError."""
    fake_headers = {'fake_response': b'invalid json'}
    with assert_raises(ValueError):
        spotlight.annotate('http://localhost', 'asdasdasd',
                           headers=fake_headers)
def test_missing_resources():
    """Valid JSON without a Resources section raises SpotlightException."""
    fake_headers = {'fake_response': b'{"Test": "Win"}'}
    with assert_raises(spotlight.SpotlightException):
        spotlight.annotate('http://localhost', 'asdasdasd',
                           headers=fake_headers)
def test_http_fail():
    """A faked 502 status surfaces as requests' HTTPError."""
    fake_headers = {'fake_response': b'invalid json', 'fake_status': 502}
    with assert_raises(spotlight.requests.exceptions.HTTPError):
        spotlight.annotate('http://localhost', 'asdasdasd',
                           headers=fake_headers)
def annotate(text):
    """Annotate *text* against the SZTAKI English Spotlight endpoint."""
    endpoint = "http://spotlight.sztaki.hu:2222/rest/annotate"
    return spotlight.annotate(endpoint, text, confidence=0.30,
                              support=30, spotter="Default")
def linking(self, query):
    """Return the DBpedia URIs Spotlight links for *query*."""
    found = spotlight.annotate(self.uri, query,
                               confidence=self.confidence,
                               support=self.support)
    return [annotation['URI'] for annotation in found]