Example #1
class DandelionAnnotator:
    def __init__(self, app_id, app_key):
        self.app_id = app_id
        self.app_key = app_key
        self.datatxt = DataTXT(app_id=self.app_id, app_key=self.app_key)

    def dandelion_annotation(self, string):
        """
        Gets a string, annotates it, and returns the annotated version with the entities inside
        :param string:
        :return:
        """

        response = self.datatxt.nex(string, include_lod=True)
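        # With include_lod=True every annotation also exposes Linked Open Data
        # links (here the DBpedia URI used for the replacement below).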

        annotated_string = string

        shift = 0
        for annotation in response.annotations:
            start = annotation["start"]
            end = annotation["end"]
            print(shift)
            annotated_string = (
                annotated_string[:start + shift]
                + replace_dbpedia(annotation["lod"].dbpedia)
                + annotated_string[shift + end:]
            )
            print(annotated_string)
            shift += len(replace_dbpedia(annotation["lod"].dbpedia)) - (end - start)

        return annotated_string
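
A minimal usage sketch (not from the original repository): it assumes valid Dandelion credentials and stubs the replace_dbpedia helper that the class expects to find in its module.

from dandelion import DataTXT  # needed by DandelionAnnotator above


def replace_dbpedia(dbpedia_uri):
    # Hypothetical stand-in for the project's real helper: mark the mention
    # with its DBpedia URI.
    return "[" + dbpedia_uri + "]"


annotator = DandelionAnnotator(app_id="YOUR_APP_ID", app_key="YOUR_APP_KEY")
print(annotator.dandelion_annotation("Mozart was born in Salzburg."))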
Example #2
def saveAnnotation(id, text, db):
    print id
    if db.simDoc.find({'_id': id}).count() == 0:
        # print entity.id, entity.relevance_score, entity.confidence_score, entity.freebase_types, entity.wikidata_id
        wikidataEntities, dbpediaEntities = get_annotation_text_razor(text)

        datatxt = DataTXT(app_id='0b2b87bc', app_key='7f0ae25400535758e9ceae358b3db763')

        result = datatxt.nex(text.decode('latin-1'), include_lod=True, language='en')['annotations']
        #pprint(result)
        entityDbpediaSet=set()
        entityDbpedia=[]
        print result
        for entity in result:
            print entity
            if 'lod' in entity and 'dbpedia' in entity['lod'] and entity['lod']['dbpedia'] not in entityDbpediaSet:
                entityDbpedia.append({'dbpedia_id':entity['lod']['dbpedia'],
                                      'confidence':entity['confidence']})
                entityDbpediaSet.add(entity['lod']['dbpedia'])
        
        #entitySetWikidata=set(map(lambda x: x['lod']['wikidata'],result))
        #pprint(entitySetDbpedia)
        print "dbpedia %s wikidata %s"%(len(entityDbpedia),len(wikidataEntities))
        db.simDoc.insert({'_id':id,'text':text.decode('utf-8','ignore'),
                          'entities_dbpedia':entityDbpedia,
                          'entities_wikidata':wikidataEntities,
                          'entities_dbpedia_razor':dbpediaEntities})
Example #3
class TestDatatxt(TestCase):
    def setUp(self):
        default_config['app_id'] = os.environ['APP_ID']
        default_config['app_key'] = os.environ['APP_KEY']
        self.datatxt = DataTXT()

    def test_nex(self):
        res = self.datatxt.nex('They say Apple is better than Windows')
        self.assertEqual(
            {annotation.uri for annotation in res.annotations},
            {'http://en.wikipedia.org/wiki/Apple_Inc.',
             'http://en.wikipedia.org/wiki/Microsoft_Windows'}
        )

    def test_sim(self):
        res = self.datatxt.sim(
            'Reports that the NSA eavesdropped on world leaders have "severely'
            ' shaken" relations between Europe and the U.S., German Chancellor'
            ' Angela Merkel said.',
            # --
            'Germany and France are to seek talks with the US to settle a row '
            'over spying, as espionage claims continue to overshadow an EU '
            'summit in Brussels.'
        )

        self.assertGreater(res.similarity, 0.5)

    def test_li(self):
        res = self.datatxt.li("Le nostre tre M sono: mafia, mamma, mandolino")

        self.assertEqual(
            [entry.lang for entry in res.detectedLangs],
            ['it']
        )

        self.assertGreater(res.detectedLangs[0].confidence, 0.9999)

    def test_raises_on_error(self):
        with self.assertRaises(DandelionException):
            self.datatxt.nex(text=None)

    def test_can_set_host(self):
        self.datatxt = DataTXT(host="api.dandelion.eu")
        self.test_nex()

        self.datatxt = DataTXT(host="http://api.dandelion.eu")
        self.test_nex()
Example #4
    def get_seed_type(self, seed_name):
        app_id = configuration.APP_ID
        app_key = configuration.API_KEY_DANDELION
        datatxt = DataTXT(app_id=app_id, app_key=app_key)
        response = datatxt.nex(seed_name, **{
            "min_confidence": 0.6,
            "include": ["types"]
        })
        return response.annotations
Example #5
def nerelEn(text):
    #text="voglio andare in bici. Che percorso mi consigliate?"
    translator = Translator()
    tr=translator.translate(text)
    text=tr.text
    datatxt = DataTXT(app_id='5cb879ebda544e2e95ce5cefd4963aca', app_key='5cb879ebda544e2e95ce5cefd4963aca')
    response = datatxt.nex(text, min_confidence=0.20, include_types=True, include_abstract=True, include_lod=True, include_categories=True)
    time = response['annotations']
    #print(time)
    #entity = []
    #print(time)
    categories = []
    entity = []
    types = []
    lods = []
    for row in time:
        name = row['spot']
        entity.append(name)
        try:
            categories.extend(row['categories'])
        except KeyError:
            print('categories not present')

        try:
            types.extend(row['types'])
        except KeyError:
            print('types not present')

        try:
            lods.append(row['lod']['dbpedia'])
        except KeyError:
            print('lod not present')

    return (text, entity, categories, types, lods)
Example #6
def nerel(text):
    datatxt = DataTXT(app_id='5cb879ebda544e2e95ce5cefd4963aca', app_key='5cb879ebda544e2e95ce5cefd4963aca')
    response = datatxt.nex(text, min_confidence=0.20, include_abstract=True, include_confidence=True, include_categories=True, include_image=True)
    time = response['annotations']

    mostConfidence = 0
    entity = []
    abstracts = []
    confidences = []
    mostConf = 0
    mostimage = ""
    categories = []
    for row in time:
        name = row['spot']
        entity.append(name)
        try:
            abstracts.append(row['abstract'])
        except KeyError:
            print('abstract not present')
            abstracts.append("abstract not present")

        try:
            confidence = row['confidence']
            if confidence > mostConfidence:
                mostConfidence = confidence
                mostConf = name
                mostimage = row['image']['thumbnail']
            confidences.append(confidence)
        except KeyError:
            print('confidence not present')
            confidences.append("")

        try:
            categories.extend(row['categories'])
        except KeyError:
            print('categories not present')

    return (entity, abstracts, confidences, categories, mostConf, mostimage)
Example #7
def dandelion(item,tool_name):
    
    text = item["text"].encode('utf-8')
    dpaId = item["dpaId"]
    
    datatxt = DataTXT(app_id=token, app_key=token)
    response = datatxt.nex(
        text,
        include_categories=True,
        include_types=True,
        include_image=True,
        include_lod=True,
        include_alternate_labels=True,
        include_abstract=True)
    try:
        if response["lang"] != "de":
            output=[False,response]
        elif response["lang"] == "de":
            try:
                annotation=[]
                t=time.time()
                for entity in response.annotations:
                    wiki= str(entity["id"])
                    uri = wiki_query(wiki)
                    category = query_category(uri)
                    surface = entity["spot"]
                    start = entity["start"]
                    end = entity["end"]
                    label = entity["title"]
                    insert_dict={
                        "start" : start,
                        "end" : end,
                        "label" : label,
                        "surface" : surface,
                        "uri" : uri,
                        "category_tool" : "",
                        "category" : category,
                        "dpaid" : dpaId,
                        "timestamp" : '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.utcfromtimestamp(t)),
                        "tool" : tool_name
                        }
                    annotation.append(insert_dict)
                output=[True,annotation]
                # import IPython
                # IPython.embed()
            except KeyError:
                output= [KeyError,response]
    except KeyError:
        output= [KeyError,response]

    return output
Example #8
class ChunksTest(unittest.TestCase):
    def setUp(self):
        # Retrieve a small sample of tweets (documents 10-15 of the collection)
        tweets = list(mongo_manager.MongoManager(configuration.db_name).find("tweets", {}))[10:16]
        self.datatxt = DataTXT(app_id=configuration.APP1_ID, app_key=configuration.API_KEY_DANDELION1)
        self.t = tweets_chunk.TweetsChunk(tweets)

    def test_chunks(self):
        unique = self.t.get_unique_string()
        print(unique)
        response = self.datatxt.nex(self.t.get_unique_string(),
                                    **{"include": ["types", "categories", "abstract", "alternate_labels"],
                                       "social.hashtag": True, "social.mention": True})
        print(response.annotations)
        self.t.split_annotation_each_tweet(response.annotations)
        print(self.t.index_tweet)
Example #9
    def run(self, tweets_chunks, app_id, app_key):
        datatxt = DataTXT(app_id=app_id, app_key=app_key)
        for tweets in tweets_chunks:
            join_tweets = tweets_chunk.TweetsChunk(tweets)
            pprint.pprint(len(tweets))
            try:
                response = datatxt.nex(
                    join_tweets.get_unique_string(), **{
                        "lang": tweets[0]["lang"],
                        "include": ["types", "categories", "abstract", "alternate_labels"],
                        "social.hashtag": True,
                        "social.mention": True,
                        "min_confidence": 0
                    })
                # print(response)
            except DandelionException as e:
                logging.error("%s: %s", e.code, e.message)
                continue
            join_tweets.split_annotation_each_tweet(response.annotations)
            # pprint.pprint(join_tweets.index_tweet)
            for tweet in join_tweets.index_tweet:
                #seed_id = list(self.db_manager.find("seeds", {"handle": tweet["tweet"]["user"]["screen_name"], "id_experiment":self.id_experiment}))
                #if(len(seed_id)>0):
                #        seed_id=seed_id[0]["_id"]
                #else:
                #    pprint.pprint(tweet["tweet"]["user"]["screen_name"])
                #    continue

                seed_id = tweet["tweet"]["seed"]
                for annotation in tweet["annotations"]:
                    annotation["tweet"] = tweet["tweet"]["_id"]
                    annotation["seed"] = seed_id
                    annotation["concrete_types"] = self.find_concrete_type(
                        annotation["types"], self.ontology)
                    annotation["id_experiment"] = self.id_experiment
                    #print(annotation)
                    self.db_manager.write_mongo("entity", annotation)
Example #10
class DandelionEntityExtractor(EntityExtractor):

    # http://mappings.dbpedia.org/server/ontology/classes/
    __dbpedia_type_to_entity_type = {
        'http://dbpedia.org/ontology/Person': EntityType.PERSON,
        'http://dbpedia.org/ontology/Place': EntityType.PLACE,
        'http://dbpedia.org/ontology/Organisation': EntityType.GROUP,
        'http://dbpedia.org/ontology/Group': EntityType.GROUP,
        'http://dbpedia.org/ontology/Event': EntityType.EVENT,
        'http://dbpedia.org/ontology/TimePeriod': EntityType.DATE,
        'http://dbpedia.org/ontology/Activity': EntityType.ACTIVITY,
        'http://dbpedia.org/ontology/Work': EntityType.MANMADEOBJECT
    }

    def __init__(self):
        token = os.environ.get('DANDELION_TOKEN')
        if token is None:
            raise Exception(
                'Environment variable "DANDELION_TOKEN" must be set')
        self.__datatxt = DataTXT(token=token)

    def extract_entities(self, text):
        response = self.__datatxt.nex(text, include_types=True)
        return self.__convert_entities(response.annotations)

    def __convert_entities(self, annotations):
        converted_entities = []
        for annotation in annotations:
            entity_type = self.__convert_types(annotation.types)
            converted_entity = Entity(annotation.label, entity_type,
                                      annotation.start, annotation.end)
            converted_entities.append(converted_entity)
        return converted_entities

    def __convert_types(self, types):
        entity_type = EntityType.THING
        if len(types) > 0:
            for t in types:
                if t in DandelionEntityExtractor.__dbpedia_type_to_entity_type:
                    entity_type = DandelionEntityExtractor.__dbpedia_type_to_entity_type[
                        t]
                    break
        return entity_type
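
A minimal usage sketch (not part of the original source): it assumes the project's Entity, EntityType, and EntityExtractor classes are importable and that a real Dandelion token is available.

import os

# Hypothetical driver; the token value is a placeholder.
os.environ.setdefault('DANDELION_TOKEN', 'YOUR_DANDELION_TOKEN')

extractor = DandelionEntityExtractor()
for entity in extractor.extract_entities("Angela Merkel spoke in Berlin."):
    # Each Entity wraps the annotation label, the mapped EntityType, and the span offsets.
    print(entity)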
Example #11
    def get_entities(self, text, lang='en', min_confidence=0.7, include='types, lod'):
        """
        Dato un testo recupera le entità.

        :param text: rappresenta il testo da cui vogliamo estrarre le entità
        :param lang: indica la lingua in cui è scritto il testo
        :param min_confidence: indica il valore minimo affinchè l'entità estratta venga restituita
        :param include: consente di specificare dei parametri per ottenere più informazioni dalle API di Dandelion.
        In particolare:
            - type: consente di aggiungere informazioni sul tipo (tassonomia) dell'entità estratta attravero una lista
            di link a DBpedia. Se lang='en' vengono restituiti link relativi a DBpedia English.
            - lod: aggiunge link relativi alle equivalenti entità presenti in DBpedia.
        :return: la lista di entità estratte dal documento
        """

        entities = []
        self.validate_token()

        datatxt = DataTXT(token=self._tokenList[self._indexToken])
        annotations = datatxt.nex(
            text,
            lang=lang,
            min_confidence=min_confidence,
            include=include
        ).annotations

        for annotation in annotations:
            entities.append({
                'title': annotation.title,
                'wikipediaURI': annotation.lod.wikipedia,
                'dbpediaURI': annotation.lod.dbpedia,
                'types': annotation.types
            })

        self._requests = self._requests + 1

        return entities
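
A hypothetical call sketch, assuming `extractor` is an instance of the enclosing class with a valid Dandelion token already configured:

entities = extractor.get_entities(
    "Leonardo da Vinci painted the Mona Lisa.",
    lang='en',
    min_confidence=0.7,
    include='types, lod'
)
for e in entities:
    # Each entry holds the title plus Wikipedia/DBpedia URIs and DBpedia types.
    print(e['title'], e['dbpediaURI'])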
Example #12
# Loading data
dr_tr = DataReader('./Data/olid-training-v1.tsv', 'A')
data_tr, labels_tr = dr_tr.get_labelled_data()
dr_tst = DataReader('./Data/testset-levela.tsv', 'A')
data_tst, label_tst = dr_tst.get_test_data()

data_tr = data_tr[:]
data_tst = data_tst[:]

entities_tr = []
entities_tst = []

# Entity extraction using dandelion
for line in tqdm.tqdm(data_tr):
    temp = []
    for annotation in datatxt.nex(normalizeTweet(line), lang='en').annotations:
        temp.append(annotation.title)
    entities_tr.append(temp)

for line in tqdm.tqdm(data_tst):
    temp = []
    for annotation in datatxt.nex(normalizeTweet(line), lang='en').annotations:
        temp.append(annotation.title)
    entities_tst.append(temp)

# Saving to files
with open('./pickles/dande_train.pkl', 'wb') as f:
    cPickle.dump(entities_tr, f)
with open('./pickles/dande_test.pkl', 'wb') as f:
    cPickle.dump(entities_tst, f)
Example #13
def AnalyseText(text):
    datatxt = DataTXT(app_id='cd32413268454e19a31776d33b5f0ba0',
                      app_key='cd32413268454e19a31776d33b5f0ba0')
    response = datatxt.nex(text, include="categories")

    return response.annotations
Example #14
def get_entities_from_dandelion(text):
    # TODO: move the keys into a settings file
    datatxt = DataTXT(app_id='7c418708', app_key='0043c60be84a1f471184a192fe06e540')
    result = datatxt.nex(text, include_lod=True, language='en')
    return result
Example #15
def get_annotation_dandelion(text):
    datatxt = DataTXT(app_id='0b2b87bc', app_key='7f0ae25400535758e9ceae358b3db763')

    result = datatxt.nex(text.decode('latin-1'), include_lod=True, language='en')
    
    pprint(result)
Example #16
        close_match = ''
        if closeMatch:
            for cl in closeMatch[0]:
                if 'it.dbpedia.org' in cl:
                    close_match = cl

        text = u''
        if definition or scopeNote:
            wiki_match = ''
            text = u'{name} {definition} {scopeNote}'.format(
                name=name,
                definition=definition,
                scopeNote=scopeNote)

            try:
                annotations = datatxt.nex(text, lang='it')
            except dandelion.base.DandelionException as e:
                if e.message == u'usage limits are exceeded':
                    print 'DataTXT daily usage limits met, exiting.'
                    exit(0)
                else:
                    import pdb
                    pdb.set_trace()

            try:
                annlist = annotations['annotations']
                for ann in annlist:
                    start = ann['start']
                    end = ann['end']
                    if start == 0:
                        if end == len(name):
Example #17
def get_entities_from_dandelion(text):
    # TODO: move the keys into a settings file
    datatxt = DataTXT(app_id='7c418708',
                      app_key='0043c60be84a1f471184a192fe06e540')
    result = datatxt.nex(text, include_lod=True, language='en')
    return result