from dandelion import DataTXT

class DandelionAnnotator:

    def __init__(self, app_id, app_key):
        self.app_id = app_id
        self.app_key = app_key
        self.datatxt = DataTXT(app_id=self.app_id, app_key=self.app_key)

    def dandelion_annotation(self, string):
        """
        Annotates a string and returns the annotated version with the
        entities substituted in place.

        :param string: the text to annotate
        :return: the text with each spot replaced by its DBpedia-derived form
        """
        response = self.datatxt.nex(string, include_lod=True)
        annotated_string = string
        shift = 0  # how much the string has grown or shrunk so far
        for annotation in response.annotations:
            start = annotation["start"]
            end = annotation["end"]
            # replace_dbpedia is an external helper (see the sketch below)
            replacement = replace_dbpedia(annotation["lod"].dbpedia)
            annotated_string = (annotated_string[:start + shift]
                                + replacement
                                + annotated_string[shift + end:])
            shift += len(replacement) - (end - start)
        return annotated_string
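The class above leans on an external replace_dbpedia helper that the snippet does not define. A minimal usage sketch, assuming replace_dbpedia simply shortens a DBpedia URI to a bracketed label; the stub, the credentials, and the expected output below are all placeholders:

# Hypothetical stub for the missing helper: turn a DBpedia URI into a token.
def replace_dbpedia(dbpedia_uri):
    return '[' + dbpedia_uri.rsplit('/', 1)[-1] + ']'

annotator = DandelionAnnotator(app_id='YOUR_APP_ID', app_key='YOUR_APP_KEY')
print(annotator.dandelion_annotation('Apple was founded by Steve Jobs'))
# e.g. -> "[Apple_Inc.] was founded by [Steve_Jobs]"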
from dandelion import DataTXT

def saveAnnotation(id, text, db):
    # `text` is expected to be a byte string; `get_annotation_text_razor`
    # is an external helper from the surrounding project.
    print(id)
    if db.simDoc.count_documents({'_id': id}) == 0:
        wikidataEntities, dbpediaEntities = get_annotation_text_razor(text)
        datatxt = DataTXT(app_id='0b2b87bc',
                          app_key='7f0ae25400535758e9ceae358b3db763')
        result = datatxt.nex(text.decode('latin-1'),
                             include_lod=True, language='en')['annotations']
        entityDbpediaSet = set()
        entityDbpedia = []
        for entity in result:
            # keep each DBpedia entity only once
            if ('lod' in entity and 'dbpedia' in entity['lod']
                    and entity['lod']['dbpedia'] not in entityDbpediaSet):
                entityDbpedia.append({'dbpedia_id': entity['lod']['dbpedia'],
                                      'confidence': entity['confidence']})
                entityDbpediaSet.add(entity['lod']['dbpedia'])
        print("dbpedia %s wikidata %s" % (len(entityDbpedia), len(wikidataEntities)))
        db.simDoc.insert_one({'_id': id,
                              'text': text.decode('utf-8', 'ignore'),
                              'entities_dbpedia': entityDbpedia,
                              'entities_wikidata': wikidataEntities,
                              'entities_dbpedia_razor': dbpediaEntities})
import os
from unittest import TestCase

from dandelion import DataTXT, DandelionException, default_config

class TestDatatxt(TestCase):

    def setUp(self):
        default_config['app_id'] = os.environ['APP_ID']
        default_config['app_key'] = os.environ['APP_KEY']
        self.datatxt = DataTXT()

    def test_nex(self):
        res = self.datatxt.nex('They say Apple is better than Windows')
        self.assertEqual(
            {annotation.uri for annotation in res.annotations},
            {'http://en.wikipedia.org/wiki/Apple_Inc.',
             'http://en.wikipedia.org/wiki/Microsoft_Windows'}
        )

    def test_sim(self):
        res = self.datatxt.sim(
            'Reports that the NSA eavesdropped on world leaders have "severely'
            ' shaken" relations between Europe and the U.S., German Chancellor'
            ' Angela Merkel said.',
            # --
            'Germany and France are to seek talks with the US to settle a row '
            'over spying, as espionage claims continue to overshadow an EU '
            'summit in Brussels.'
        )
        self.assertGreater(res.similarity, 0.5)

    def test_li(self):
        res = self.datatxt.li("Le nostre tre M sono: mafia, mamma, mandolino")
        self.assertEqual(
            [entry.lang for entry in res.detectedLangs],
            ['it']
        )
        self.assertGreater(res.detectedLangs[0].confidence, 0.9999)

    def test_raises_on_error(self):
        with self.assertRaises(DandelionException):
            self.datatxt.nex(text=None)

    def test_can_set_host(self):
        self.datatxt = DataTXT(host="api.dandelion.eu")
        self.test_nex()
        self.datatxt = DataTXT(host="http://api.dandelion.eu")
        self.test_nex()
def get_seed_type(self, seed_name):
    app_id = configuration.APP_ID
    app_key = configuration.API_KEY_DANDELION
    datatxt = DataTXT(app_id=app_id, app_key=app_key)
    response = datatxt.nex(seed_name, **{
        "min_confidence": 0.6,
        "include": ["types"]
    })
    return response.annotations
from googletrans import Translator
from dandelion import DataTXT

def nerelEn(text):
    # sample Italian input: "voglio andare in bici. Che percorso mi
    # consigliate?" ("I want to go cycling. What route do you recommend?")
    # Translate the input to English before annotating it.
    translator = Translator()
    text = translator.translate(text).text
    datatxt = DataTXT(app_id='5cb879ebda544e2e95ce5cefd4963aca',
                      app_key='5cb879ebda544e2e95ce5cefd4963aca')
    response = datatxt.nex(text,
                           min_confidence=0.20,
                           include_types=True,
                           include_abstract=True,
                           include_lod=True,
                           include_categories=True)
    annotations = response['annotations']
    entity = []
    categories = []
    types = []
    lods = []
    for row in annotations:
        entity.append(row['spot'])
        try:
            categories.extend(row['categories'])
        except KeyError:
            print('categories not present')
        try:
            types.extend(row['types'])
        except KeyError:
            print('types not present')
        try:
            lods.append(row['lod']['dbpedia'])
        except KeyError:
            print('lod not present')
    return (text, entity, categories, types, lods)
def nerel(text):
    datatxt = DataTXT(app_id='5cb879ebda544e2e95ce5cefd4963aca',
                      app_key='5cb879ebda544e2e95ce5cefd4963aca')
    response = datatxt.nex(text,
                           min_confidence=0.20,
                           include_abstract=True,
                           include_confidence=True,
                           include_categories=True,
                           include_image=True)
    annotations = response['annotations']
    entity = []
    abstracts = []
    confidences = []
    categories = []
    mostConfidence = 0
    mostConf = 0
    mostimage = ""
    for row in annotations:
        name = row['spot']
        entity.append(name)
        try:
            abstracts.append(row['abstract'])
        except KeyError:
            print('abstract not present')
            abstracts.append("abstract not present")
        try:
            confidence = row['confidence']
            # remember the most confident entity and its thumbnail
            if confidence > mostConfidence:
                mostConfidence = confidence
                mostConf = name
                mostimage = row['image']['thumbnail']
            confidences.append(confidence)
        except KeyError:
            print('confidence not present')
            confidences.append("")
        try:
            categories.extend(row['categories'])
        except KeyError:
            print('categories not present')
    return (entity, abstracts, confidences, categories, mostConf, mostimage)
def dandelion(item, tool_name):
    text = item["text"].encode('utf-8')
    dpaId = item["dpaId"]
    datatxt = DataTXT(app_id=token, app_key=token)
    response = datatxt.nex(
        text,
        include_categories=True,
        include_types=True,
        include_image=True,
        include_lod=True,
        include_alternate_labels=True,
        include_abstract=True)
    try:
        if response["lang"] != "de":
            output = [False, response]
        else:
            annotation = []
            t = time.time()
            for entity in response.annotations:
                wiki = str(entity["id"])
                uri = wiki_query(wiki)          # external helper
                category = query_category(uri)  # external helper
                insert_dict = {
                    "start": entity["start"],
                    "end": entity["end"],
                    "label": entity["title"],
                    "surface": entity["spot"],
                    "uri": uri,
                    "category_tool": "",
                    "category": category,
                    "dpaid": dpaId,
                    "timestamp": '{:%Y-%m-%d %H:%M:%S}'.format(
                        datetime.datetime.utcfromtimestamp(t)),
                    "tool": tool_name
                }
                annotation.append(insert_dict)
            output = [True, annotation]
    except KeyError:
        output = [KeyError, response]
    return output
class ChunksTest(unittest.TestCase):

    def setUp(self):
        # Retrieve a small sample of tweets
        tweets = list(mongo_manager.MongoManager(
            configuration.db_name).find("tweets", {}))[10:16]
        self.datatxt = DataTXT(app_id=configuration.APP1_ID,
                               app_key=configuration.API_KEY_DANDELION1)
        self.t = tweets_chunk.TweetsChunk(tweets)

    def test_chunks(self):
        unique = self.t.get_unique_string()
        print(unique)
        response = self.datatxt.nex(
            unique,
            **{"include": ["types", "categories", "abstract", "alternate_labels"],
               "social.hashtag": True,
               "social.mention": True})
        print(response.annotations)
        self.t.split_annotation_each_tweet(response.annotations)
        print(self.t.index_tweet)
def run(self, tweets_chunks, app_id, app_key):
    datatxt = DataTXT(app_id=app_id, app_key=app_key)
    for tweets in tweets_chunks:
        join_tweets = tweets_chunk.TweetsChunk(tweets)
        pprint.pprint(len(tweets))
        try:
            response = datatxt.nex(
                join_tweets.get_unique_string(),
                **{
                    "lang": tweets[0]["lang"],
                    "include": [
                        "types", "categories", "abstract", "alternate_labels"
                    ],
                    "social.hashtag": True,
                    "social.mention": True,
                    "min_confidence": 0
                })
        except DandelionException as e:
            logging.error("%s: %s", e.code, e.message)
            continue
        join_tweets.split_annotation_each_tweet(response.annotations)
        for tweet in join_tweets.index_tweet:
            seed_id = tweet["tweet"]["seed"]
            for annotation in tweet["annotations"]:
                annotation["tweet"] = tweet["tweet"]["_id"]
                annotation["seed"] = seed_id
                annotation["concrete_types"] = self.find_concrete_type(
                    annotation["types"], self.ontology)
                annotation["id_experiment"] = self.id_experiment
                self.db_manager.write_mongo("entity", annotation)
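TweetsChunk is project-specific, but the pattern it implements is general: concatenate many short texts into one string so a single nex call annotates them all, then route each annotation back to its source text by character offsets. A simplified, hypothetical sketch of that bookkeeping:

# Sketch of the idea behind TweetsChunk (hypothetical, simplified): join
# tweets with a separator, annotate the joined string once, then map each
# annotation back to its tweet via character offsets.
SEP = "\n"

def join_texts(tweets):
    return SEP.join(t["text"] for t in tweets)

def split_annotations(tweets, annotations):
    bounds, pos = [], 0
    for t in tweets:
        bounds.append((pos, pos + len(t["text"])))
        pos += len(t["text"]) + len(SEP)
    per_tweet = [[] for _ in tweets]
    for ann in annotations:
        for i, (lo, hi) in enumerate(bounds):
            if lo <= ann["start"] < hi:
                per_tweet[i].append(ann)
                break
    return per_tweet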
class DandelionEntityExtractor(EntityExtractor):
    # http://mappings.dbpedia.org/server/ontology/classes/
    __dbpedia_type_to_entity_type = {
        'http://dbpedia.org/ontology/Person': EntityType.PERSON,
        'http://dbpedia.org/ontology/Place': EntityType.PLACE,
        'http://dbpedia.org/ontology/Organisation': EntityType.GROUP,
        'http://dbpedia.org/ontology/Group': EntityType.GROUP,
        'http://dbpedia.org/ontology/Event': EntityType.EVENT,
        'http://dbpedia.org/ontology/TimePeriod': EntityType.DATE,
        'http://dbpedia.org/ontology/Activity': EntityType.ACTIVITY,
        'http://dbpedia.org/ontology/Work': EntityType.MANMADEOBJECT
    }

    def __init__(self):
        token = os.environ.get('DANDELION_TOKEN')
        if token is None:
            raise Exception(
                'Environment variable "DANDELION_TOKEN" must be set')
        self.__datatxt = DataTXT(token=token)

    def extract_entities(self, text):
        response = self.__datatxt.nex(text, include_types=True)
        return self.__convert_entities(response.annotations)

    def __convert_entities(self, annotations):
        converted_entities = []
        for annotation in annotations:
            entity_type = self.__convert_types(annotation.types)
            converted_entity = Entity(annotation.label, entity_type,
                                      annotation.start, annotation.end)
            converted_entities.append(converted_entity)
        return converted_entities

    def __convert_types(self, types):
        # Fall back to THING when no known DBpedia type matches.
        entity_type = EntityType.THING
        for t in types:
            if t in DandelionEntityExtractor.__dbpedia_type_to_entity_type:
                entity_type = DandelionEntityExtractor.__dbpedia_type_to_entity_type[t]
                break
        return entity_type
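Usage is straightforward once the token is in the environment; a short sketch, where the sample sentence and the printed format are illustrative only:

import os
os.environ.setdefault('DANDELION_TOKEN', 'YOUR_TOKEN')  # placeholder token

extractor = DandelionEntityExtractor()
for entity in extractor.extract_entities('Tim Cook runs Apple in Cupertino'):
    print(entity)  # an Entity(label, type, start, end) per the class above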
def get_entities(self, text, lang='en', min_confidence=0.7,
                 include='types, lod'):
    """
    Given a text, retrieves its entities.

    :param text: the text from which we want to extract entities
    :param lang: the language the text is written in
    :param min_confidence: the minimum confidence an extracted entity must
        reach in order to be returned
    :param include: extra parameters passed to the Dandelion API to obtain
        more information. In particular:
        - types: adds information about the type (taxonomy) of the extracted
          entity through a list of DBpedia links. If lang='en', the links
          point to the English DBpedia.
        - lod: adds links to the equivalent entities in DBpedia.
    :return: the list of entities extracted from the document
    """
    entities = []
    self.validate_token()
    datatxt = DataTXT(token=self._tokenList[self._indexToken])
    annotations = datatxt.nex(
        text,
        lang=lang,
        min_confidence=min_confidence,
        include=include
    ).annotations
    for annotation in annotations:
        entities.append({
            'title': annotation.title,
            'wikipediaURI': annotation.lod.wikipedia,
            'dbpediaURI': annotation.lod.dbpedia,
            'types': annotation.types
        })
    self._requests = self._requests + 1
    return entities
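The _tokenList / _indexToken pair suggests a token-rotation scheme that spreads requests across several API keys. A purely hypothetical sketch of what validate_token might do; the threshold is an assumption, not the project's actual quota logic:

def validate_token(self):
    # Hypothetical rotation: after N requests, switch to the next token.
    REQUESTS_PER_TOKEN = 900  # assumed per-key budget, not an API constant
    if self._requests >= REQUESTS_PER_TOKEN:
        self._indexToken = (self._indexToken + 1) % len(self._tokenList)
        self._requests = 0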
import tqdm
import cPickle
from dandelion import DataTXT

# Loading data (DataReader and normalizeTweet come from the surrounding
# project; the credentials below are placeholders)
dr_tr = DataReader('./Data/olid-training-v1.tsv', 'A')
data_tr, labels_tr = dr_tr.get_labelled_data()
dr_tst = DataReader('./Data/testset-levela.tsv', 'A')
data_tst, label_tst = dr_tst.get_test_data()

entities_tr = []
entities_tst = []
datatxt = DataTXT(app_id='YOUR_APP_ID', app_key='YOUR_APP_KEY')

# Entity extraction using Dandelion
for line in tqdm.tqdm(data_tr):
    temp = []
    for annotation in datatxt.nex(normalizeTweet(line), lang='en').annotations:
        temp.append(annotation.title)
    entities_tr.append(temp)

for line in tqdm.tqdm(data_tst):
    temp = []
    for annotation in datatxt.nex(normalizeTweet(line), lang='en').annotations:
        temp.append(annotation.title)
    entities_tst.append(temp)

# Saving to files
with open('./pickles/dande_train.pkl', 'wb') as f:
    cPickle.dump(entities_tr, f)

with open('./pickles/dande_test.pkl', 'wb') as f:
    cPickle.dump(entities_tst, f)
def AnalyseText(text):
    datatxt = DataTXT(app_id='cd32413268454e19a31776d33b5f0ba0',
                      app_key='cd32413268454e19a31776d33b5f0ba0')
    response = datatxt.nex(text, include="categories")
    return response.annotations
def get_entities_from_dandelion(text):
    # TODO: move the keys to a settings file
    datatxt = DataTXT(app_id='7c418708',
                      app_key='0043c60be84a1f471184a192fe06e540')
    result = datatxt.nex(text, include_lod=True, language='en')
    return result
def get_annotation_dandelion(text):
    datatxt = DataTXT(app_id='0b2b87bc',
                      app_key='7f0ae25400535758e9ceae358b3db763')
    result = datatxt.nex(text.decode('latin-1'), include_lod=True, language='en')
    pprint(result)
close_match = ''
if closeMatch:
    for cl in closeMatch[0]:
        if 'it.dbpedia.org' in cl:
            close_match = cl

text = u''
if definition or scopeNote:
    wiki_match = ''
    text = u'{name} {definition} {scopeNote}'.format(
        name=name, definition=definition, scopeNote=scopeNote)
    try:
        annotations = datatxt.nex(text, lang='it')
    except dandelion.base.DandelionException as e:
        if e.message == u'usage limits are exceeded':
            print('DataTXT daily usage limits met, exiting.')
            exit(0)
        else:
            import pdb
            pdb.set_trace()
    try:
        annlist = annotations['annotations']
        for ann in annlist:
            start = ann['start']
            end = ann['end']
            if start == 0:
                if end == len(name):
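The pdb.set_trace() branch above is a debugging stopgap. A hedged alternative is to isolate the quota check in a small wrapper so callers can stop cleanly when the daily limit is hit; the wrapper name is illustrative, and the e.message check mirrors the snippets above:

from dandelion import DataTXT
from dandelion.base import DandelionException

def annotate_or_none(datatxt, text, lang='it'):
    """Return annotations, or None once the daily quota is exhausted."""
    try:
        return datatxt.nex(text, lang=lang)
    except DandelionException as e:
        if e.message == u'usage limits are exceeded':
            return None  # caller decides whether to exit or resume later
        raise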