Code example #1
File: save_documents_mongodb.py  Project: linkTDP/CSA
def saveAnnotation(id,text,db):
    print id
    if db.simDoc.find({'_id':id}).count() == 0:
        # print entity.id, entity.relevance_score, entity.confidence_score, entity.freebase_types, entity.wikidata_id
        wikidataEntities,dbpediaEntities=get_annotation_text_razor(text)
        
        datatxt = DataTXT(app_id='0b2b87bc', app_key='7f0ae25400535758e9ceae358b3db763')

        result = datatxt.nex(text.decode('latin-1'), include_lod=True, language='en')['annotations']
        #pprint(result)
        entityDbpediaSet=set()
        entityDbpedia=[]
        print result
        for entity in result:
            print entity
            if 'lod' in entity and 'dbpedia' in entity['lod'] and entity['lod']['dbpedia'] not in entityDbpediaSet:
                entityDbpedia.append({'dbpedia_id':entity['lod']['dbpedia'],
                                      'confidence':entity['confidence']})
                entityDbpediaSet.add(entity['lod']['dbpedia'])
        
        #entitySetWikidata=set(map(lambda x: x['lod']['wikidata'],result))
        #pprint(entitySetDbpedia)
        print "dbpedia %s wikidata %s"%(len(entityDbpedia),len(wikidataEntities))
        db.simDoc.insert({'_id':id,'text':text.decode('utf-8','ignore'),
                          'entities_dbpedia':entityDbpedia,
                          'entities_wikidata':wikidataEntities,
                          'entities_dbpedia_razor':dbpediaEntities})
Code example #2
def tok1(msg):
    lis = []
    li = []
    datatxt = DataTXT(app_id='5d504312af124377bac2f69c908dc20b',
                      app_key='5d504312af124377bac2f69c908dc20b')
    repnews = [
        'news.google.co.in', 'nytimes.com', 'timesofindia.indiatimes.com',
        'wsj.com', 'washingtonpost.com', 'bbc.com', 'moneycontrol.com',
        'economist.com', 'newyorker.com', 'economictimes.indiatimes.com',
        'ndtv.com', 'indiatoday.in', 'indianexpress.com', 'thehindu.com',
        'news18.com', 'firstpost.com', 'dnaindia.com', 'apnews.com',
        'brief.news', 'npr.org', 'scroll.in', 'reuters.com'
    ]
    tokenizer = RegexpTokenizer(r'\w+')
    a = tokenizer.tokenize(msg)
    stop = stopwords.words('english') + list(string.punctuation)
    a = [i for i in a if i not in stop]
    er = EventRegistry(apiKey="e010e4f7-343c-49d5-893d-63d4c2cfd487")
    q = QueryArticlesIter(keywords=QueryItems.OR(a),
                          lang=["eng"],
                          keywordsLoc="title")
    b = q.execQuery(er, sortBy="rel", maxItems=1)
    for article in b:
        if (article['source']['uri'] in repnews):
            if article['title'] not in lis:
                lis.append(article['title'])
    for i in range(len(lis)):
        a = datatxt.sim(msg, lis[i])
        if a['similarity'] >= 0.60:
            print(a['similarity'])
            li.append(lis[i])
    return li
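
A usage sketch for tok1 (it assumes the nltk, eventregistry, and dandelion imports plus the API keys above are available; the headline text is just an illustration, not from the original project):

# Hypothetical call: returns headlines from the reputable-source list judged similar to the message.
matches = tok1("Government announces new economic policy")
print(matches)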
Code example #3
 def get_seed_type(self, seed_name):
     app_id = configuration.APP_ID
     app_key = configuration.API_KEY_DANDELION
     datatxt = DataTXT(app_id=app_id, app_key=app_key)
     response = datatxt.nex(seed_name, **{
         "min_confidence": 0.6,
         "include": ["types"]
     })
     return response.annotations
Code example #4
def nerelEn(text):
    #text="voglio andare in bici. Che percorso mi consigliate?"
    translator = Translator()
    tr=translator.translate(text)
    text=tr.text
    datatxt = DataTXT(app_id='5cb879ebda544e2e95ce5cefd4963aca', app_key='5cb879ebda544e2e95ce5cefd4963aca')
    response = datatxt.nex(text, min_confidence=0.20, include_types=True, include_abstract=True, include_lod=True, include_categories=True)
    time = response['annotations']
    #print(time)
    #entity = []
    #print(time)
    index=0
    categories=[]
    entity = [] 
    types=[]
    lods=[]
    for index, row in enumerate(time):

        ca=[]
        ty=[]
        lo=[]
        name = time[index]['spot']
        entity.append(name)
        try:
            categoria = time[index]['categories']
            ca.append(categoria)
            for r in ca:
                for o in r:
                    categories.append(o)
        except:
            print('categories not present')
            #categories.append("")

        try:
            typ = time[index]['types']
            ty.append(typ)
            for r in ty:
                for o in r:
                    types.append(o)
        except:
            print('types not present')
            #types.append("")
        try:
            lod = time[index]['lod']['dbpedia']
            lo.append(lod)
            for r in lo:
                    lods.append(r)
        except:
            print('lod not present')

        #print(lo)
    return (text,entity,categories,types,lods)
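
A usage sketch for nerelEn, reusing the sample sentence from the commented-out line at the top of the function (it assumes the googletrans Translator and dandelion imports are in place):

# Hypothetical call: translates the Italian text to English, then annotates it.
text, entity, categories, types, lods = nerelEn("voglio andare in bici. Che percorso mi consigliate?")
print(entity)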
Code example #5
def dandelion(item,tool_name):
    
    text = item["text"].encode('utf-8')
    dpaId = item["dpaId"]
    
    datatxt = DataTXT(app_id=token, app_key=token)
    response = datatxt.nex(
        text,
        include_categories=True,
        include_types=True,
        include_image=True,
        include_lod=True,
        include_alternate_labels=True,
        include_abstract=True)
    try:
        if response["lang"] != "de":
            output=[False,response]
        elif response["lang"] == "de":
            try:
                annotation=[]
                t=time.time()
                for entity in response.annotations:
                    wiki= str(entity["id"])
                    uri = wiki_query(wiki)
                    category = query_category(uri)
                    surface = entity["spot"]
                    start = entity["start"]
                    end = entity["end"]
                    label = entity["title"]
                    insert_dict={
                        "start" : start,
                        "end" : end,
                        "label" : label,
                        "surface" : surface,
                        "uri" : uri,
                        "category_tool" : "",
                        "category" : category,
                        "dpaid" : dpaId,
                        "timestamp" : '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.utcfromtimestamp(t)),
                        "tool" : tool_name
                        }
                    annotation.append(insert_dict)
                output=[True,annotation]
                # import IPython
                # IPython.embed()
            except KeyError:
                output= [KeyError,response]
    except KeyError:
        output= [KeyError,response]

    return output
Code example #6
def nerel(text):
    datatxt = DataTXT(app_id='5cb879ebda544e2e95ce5cefd4963aca', app_key='5cb879ebda544e2e95ce5cefd4963aca')
    response = datatxt.nex(text, min_confidence=0.20, include_abstract=True, include_confidence=True, include_categories=True, include_image=True)
    time = response['annotations']

    mostConfidence=0
    #print(response)
    index=0
    entity=[]
    abstracts=[]
    confidences=[]
    mostConf=0
    mostimage=""
    categories=[]
    for index, row in enumerate(time):

        ca=[]
        name = time[index]['spot']
        entity.append(name)
        try:
            abstract = time[index]['abstract']
            abstracts.append(abstract)
            #print(abstract)
        except:
            print('abstract not present')
            abstracts.append("abstact not present")
        try:
            confidence = time[index]['confidence']
            if confidence > mostConfidence:
                #print('ok')
                mostConfidence=confidence
                mostConf=name
                mostimage=time[index]['image']['thumbnail']
            #print(confidence)
            confidences.append(confidence)
        except:
            print('confidence not present')
            confidences.append("")
        try:
            categoria = time[index]['categories']
            ca.append(categoria)
            for r in ca:
                for o in r:
                    #print(o)
                    categories.append(o)
        except:
            print('categories not present')
            #categories.append("")


    return (entity, abstracts, confidences, categories, mostConf, mostimage)
Code example #7
class DandelionAnnotator:
    def __init__(self, app_id, app_key):
        self.app_id = app_id
        self.app_key = app_key
        self.datatxt = DataTXT(app_id=self.app_id, app_key=self.app_key)

    def dandelion_annotation(self, string):
        """
        Gets a string, annotates it, and returns the annotated version with the entities inside
        :param string:
        :return:
        """

        response = self.datatxt.nex(string, include_lod=True)

        annotated_string = string

        shift = 0
        for annotation in response.annotations:
            start = annotation["start"]
            end = annotation["end"]
            print(shift)
            replacement = replace_dbpedia(annotation["lod"].dbpedia)
            annotated_string = (annotated_string[:start + shift] + replacement
                                + annotated_string[shift + end:])
            print(annotated_string)
            shift = shift + len(replacement) - (end - start)

        return annotated_string
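
A minimal usage sketch for the class above (the credentials are placeholders, and the replace_dbpedia helper is assumed to be provided by the surrounding project):

# Hypothetical usage of DandelionAnnotator with placeholder credentials.
annotator = DandelionAnnotator(app_id='YOUR_APP_ID', app_key='YOUR_APP_KEY')
print(annotator.dandelion_annotation("Barack Obama visited Berlin."))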
Code example #8
class TestDatatxt(TestCase):
    def setUp(self):
        default_config['app_id'] = os.environ['APP_ID']
        default_config['app_key'] = os.environ['APP_KEY']
        self.datatxt = DataTXT()

    def test_nex(self):
        res = self.datatxt.nex('They say Apple is better than Windows')
        self.assertEqual(
            {annotation.uri for annotation in res.annotations},
            {'http://en.wikipedia.org/wiki/Apple_Inc.',
             'http://en.wikipedia.org/wiki/Microsoft_Windows'}
        )

    def test_sim(self):
        res = self.datatxt.sim(
            'Reports that the NSA eavesdropped on world leaders have "severely'
            ' shaken" relations between Europe and the U.S., German Chancellor'
            ' Angela Merkel said.',
            # --
            'Germany and France are to seek talks with the US to settle a row '
            'over spying, as espionage claims continue to overshadow an EU '
            'summit in Brussels.'
        )

        self.assertGreater(res.similarity, 0.5)

    def test_li(self):
        res = self.datatxt.li("Le nostre tre M sono: mafia, mamma, mandolino")

        self.assertEqual(
            [entry.lang for entry in res.detectedLangs],
            ['it']
        )

        self.assertGreater(res.detectedLangs[0].confidence, 0.9999)

    def test_raises_on_error(self):
        with self.assertRaises(DandelionException):
            self.datatxt.nex(text=None)

    def test_can_set_host(self):
        self.datatxt = DataTXT(host="api.dandelion.eu")
        self.test_nex()

        self.datatxt = DataTXT(host="http://api.dandelion.eu")
        self.test_nex()
Code example #9
File: base.py  Project: suzynerd/dandelionAPI
    def test_can_authenticate(self):
        with self.assertRaises(DandelionException) as context:
            Datagem('administrative-regions')
        self.assertEqual(
            context.exception.message, 'Param "app_id" is required'
        )

        with self.assertRaises(DandelionException) as context:
            DataTXT()
        self.assertEqual(
            context.exception.message, 'Param "app_id" is required'
        )

        default_config['app_id'] = os.environ['APP_ID']
        default_config['app_key'] = os.environ['APP_KEY']

        Datagem('administrative-regions')
        DataTXT()
Code example #10
    def run(self, tweets_chunks, app_id, app_key):
        datatxt = DataTXT(app_id=app_id, app_key=app_key)
        for tweets in tweets_chunks:
            join_tweets = tweets_chunk.TweetsChunk(tweets)
            pprint.pprint(len(tweets))
            try:
                response = datatxt.nex(
                    join_tweets.get_unique_string(), **{
                        "lang": tweets[0]["lang"],
                        "include": ["types", "categories", "abstract", "alternate_labels"],
                        "social.hashtag": True,
                        "social.mention": True,
                        "min_confidence": 0
                    })
                # print(response)
            except DandelionException as e:
                logging.error("%s: %s", e.code, e.message)
                continue
            join_tweets.split_annotation_each_tweet(response.annotations)
            # pprint.pprint(join_tweets.index_tweet)
            for tweet in join_tweets.index_tweet:
                #seed_id = list(self.db_manager.find("seeds", {"handle": tweet["tweet"]["user"]["screen_name"], "id_experiment":self.id_experiment}))
                #if(len(seed_id)>0):
                #        seed_id=seed_id[0]["_id"]
                #else:
                #    pprint.pprint(tweet["tweet"]["user"]["screen_name"])
                #    continue

                seed_id = tweet["tweet"]["seed"]
                for annotation in tweet["annotations"]:
                    annotation["tweet"] = tweet["tweet"]["_id"]
                    annotation["seed"] = seed_id
                    annotation["concrete_types"] = self.find_concrete_type(
                        annotation["types"], self.ontology)
                    annotation["id_experiment"] = self.id_experiment
                    #print(annotation)
                    self.db_manager.write_mongo("entity", annotation)
Code example #11
    def get_entities(self, text, lang='en', min_confidence=0.7, include='types, lod'):
        """
        Given a text, retrieves its entities.

        :param text: the text from which we want to extract the entities
        :param lang: the language the text is written in
        :param min_confidence: the minimum confidence an extracted entity must reach to be returned
        :param include: extra parameters passed to the Dandelion API to obtain more information.
        In particular:
            - types: adds information about the type (taxonomy) of the extracted entity as a list
            of DBpedia links. If lang='en', links to the English DBpedia are returned.
            - lod: adds links to the equivalent entities in DBpedia.
        :return: the list of entities extracted from the document
        """

        entities = []
        self.validate_token()

        datatxt = DataTXT(token=self._tokenList[self._indexToken])
        annotations = datatxt.nex(
            text,
            lang=lang,
            min_confidence=min_confidence,
            include=include
        ).annotations

        for annotation in annotations:
            entities.append({
                'title': annotation.title,
                'wikipediaURI': annotation.lod.wikipedia,
                'dbpediaURI': annotation.lod.dbpedia,
                'types': annotation.types
            })

        self._requests = self._requests + 1

        return entities
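
As a reference for the include options described in the docstring, here is a standalone sketch of an equivalent nex call (the token is a placeholder); each annotation exposes the title, lod, and types fields used by get_entities:

# Illustrative only: a direct nex call with the include options discussed above.
datatxt = DataTXT(token='YOUR_DANDELION_TOKEN')
response = datatxt.nex('Rome is the capital of Italy', lang='en',
                       min_confidence=0.7, include='types, lod')
for annotation in response.annotations:
    print(annotation.title, annotation.lod.dbpedia, annotation.types)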
Code example #12
class ChunksTest(unittest.TestCase):
    def setUp(self):
        # Retrieve all tweets
        tweets = list(mongo_manager.MongoManager(configuration.db_name).find("tweets", {}))[10:16]
        self.datatxt = DataTXT(app_id=configuration.APP1_ID, app_key=configuration.API_KEY_DANDELION1)
        self.t = tweets_chunk.TweetsChunk(tweets)

    def test_chunks(self):
        unique = self.t.get_unique_string()
        print(unique)
        response = self.datatxt.nex(self.t.get_unique_string(),
                                    **{"include": ["types", "categories", "abstract", "alternate_labels"],
                                       "social.hashtag": True, "social.mention": True})
        print(response.annotations)
        self.t.split_annotation_each_tweet(response.annotations)
        print(self.t.index_tweet)
Code example #13
class DandelionEntityExtractor(EntityExtractor):

    # http://mappings.dbpedia.org/server/ontology/classes/
    __dbpedia_type_to_entity_type = {
        'http://dbpedia.org/ontology/Person': EntityType.PERSON,
        'http://dbpedia.org/ontology/Place': EntityType.PLACE,
        'http://dbpedia.org/ontology/Organisation': EntityType.GROUP,
        'http://dbpedia.org/ontology/Group': EntityType.GROUP,
        'http://dbpedia.org/ontology/Event': EntityType.EVENT,
        'http://dbpedia.org/ontology/TimePeriod': EntityType.DATE,
        'http://dbpedia.org/ontology/Activity': EntityType.ACTIVITY,
        'http://dbpedia.org/ontology/Work': EntityType.MANMADEOBJECT
    }

    def __init__(self):
        token = os.environ.get('DANDELION_TOKEN')
        if token is None:
            raise Exception(
                'Environment variable "DANDELION_TOKEN" must be set')
        self.__datatxt = DataTXT(token=token)

    def extract_entities(self, text):
        response = self.__datatxt.nex(text, include_types=True)
        return self.__convert_entities(response.annotations)

    def __convert_entities(self, annotations):
        converted_entities = []
        for annotation in annotations:
            entity_type = self.__convert_types(annotation.types)
            converted_entity = Entity(annotation.label, entity_type,
                                      annotation.start, annotation.end)
            converted_entities.append(converted_entity)
        return converted_entities

    def __convert_types(self, types):
        entity_type = EntityType.THING
        if len(types) > 0:
            for t in types:
                if t in DandelionEntityExtractor.__dbpedia_type_to_entity_type:
                    entity_type = DandelionEntityExtractor.__dbpedia_type_to_entity_type[
                        t]
                    break
        return entity_type
Code example #14
File: analyse.py  Project: Conphas221/Chatbot
def AnalyseText(text):
    datatxt = DataTXT(app_id='cd32413268454e19a31776d33b5f0ba0',
                      app_key='cd32413268454e19a31776d33b5f0ba0')
    response = datatxt.nex(text, include="categories")

    return response.annotations
Code example #15
import dandelion
from dandelion import DataTXT
token = '3d86a1a88bc4456c91f82a0d6043a31f'

from dandelion import default_config
default_config['token'] = token
datatxt = DataTXT()


def analysis(t1, t2):

    #"never" uses always the semantic algorithm
    semantic = datatxt.sim(t1, t2, binow='never')
    return round(semantic['similarity'] * 100, 2)
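
A short usage sketch for analysis (the exact score depends on the Dandelion service, so the value returned here is not guaranteed):

# Illustrative call: returns the semantic similarity of the two texts as a percentage.
score = analysis('The cat sat on the mat', 'A cat is sitting on a mat')
print(score)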
Code example #16
import API_KEYS
from dandelion import DataTXT
datatxt = DataTXT(app_id='YOUR_APP_ID', app_key='YOUR_APP_KEY')
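
This snippet only constructs the client; a minimal follow-up call might look like the sketch below (the text is a placeholder, and attribute access on the annotations follows the pattern used in the other examples):

# Hypothetical follow-up call using the client created above.
response = datatxt.nex('They say Apple is better than Windows', include_lod=True)
for annotation in response.annotations:
    print(annotation.title, annotation.uri)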
Code example #17
File: dandelion-tag.py  Project: joestazak/research
from __future__ import print_function
from dandelion import DataTXT
import sys
import os

client = DataTXT(app_id='9d7ee60076304802b131eccf185700c4',
                 app_key='9d7ee60076304802b131eccf185700c4')


def process(line):
    if len(line) > 0:
        response = client.nex(line, lang='en', social_hashtag='true')
        return ",".join([
            os.path.basename(annotation.uri)
            for annotation in response.annotations
        ]).encode('utf-8')
    else:
        return ""


def main():
    try:
        for line in sys.stdin:
            print(process(line.strip()))
    except:
        print("FAIL! " + line, file=sys.stderr)
        raise


if __name__ == "__main__":
    main()
Code example #18
            to_file.close()
            to_file_lemma.flush()
            to_file_lemma.close()
            exit(0)


if __name__ == '__main__':
    path_from = input(f'input file: ')
    path_to = input(f'output file: ')
    path_to_lemma = input(f'output file lemma: ')
    row_from = input(f'row from: ') # 5320
    sport = input(f'sport: ')
    confidence = input(f'confidence: ')
    #count_dandelion = input(f'dandelion requests: ')
    #token = input(f'token: ')       
    datatxt = DataTXT(token='')
    count_dandelion = 0
    s = sparql.Service('http://dbpedia.org/sparql', qs_encoding='utf-8')
    nlp = spacy.load("en_core_web_sm")
    lemmatizer = WordNetLemmatizer()
    # To avoid splitting on (
    prefixes = list(nlp.Defaults.prefixes)
    prefixes.remove('\\(')
    prefix_regex = spacy.util.compile_prefix_regex(prefixes)
    nlp.tokenizer.prefix_search = prefix_regex.search
    # To avoid splitting on )
    suffixes = list(nlp.Defaults.suffixes)
    suffixes.remove('\\)')
    suffix_regex = spacy.util.compile_suffix_regex(suffixes)
    nlp.tokenizer.suffix_search = suffix_regex.search
    infixes = (
Code example #19
def get_entities_from_dandelion(text):
    # TODO: put the keys in a settings file
    datatxt = DataTXT(app_id='7c418708',
                      app_key='0043c60be84a1f471184a192fe06e540')
    result = datatxt.nex(text, include_lod=True, language='en')
    return result
Code example #20
 def __init__(self, app_id, app_key):
     self.app_id = app_id
     self.app_key = app_key
     self.datatxt = DataTXT(app_id=self.app_id, app_key=self.app_key)
Code example #21
 def setUp(self):
     default_config['app_id'] = os.environ['APP_ID']
     default_config['app_key'] = os.environ['APP_KEY']
     self.datatxt = DataTXT()
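
The fixture above reads APP_ID and APP_KEY from the environment; a minimal sketch of providing them before the tests run (values are placeholders):

# Hypothetical setup: provide the credentials the setUp method expects.
import os
os.environ['APP_ID'] = 'YOUR_APP_ID'
os.environ['APP_KEY'] = 'YOUR_APP_KEY'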
Code example #22
File: save_documents_mongodb.py  Project: linkTDP/CSA
def get_annotation_dandelion(text):
    datatxt = DataTXT(app_id='0b2b87bc', app_key='7f0ae25400535758e9ceae358b3db763')

    result = datatxt.nex(text.decode('latin-1'), include_lod=True, language='en')
    
    pprint(result)
Code example #23
    stopwords_file = arguments['--stopwords']
    stopwords = read_stopword(stopwords_file)

    new_annotations = arguments['--new-annotations']
    different_annotations = arguments['--different-annotations']

    processed_items = arguments['--processed-items']

    stemmer = Stemmer('italian')

    app_id = config.get('keys', 'app_id')
    app_key = config.get('keys', 'app_key')
    cache_dir = config.get('cache', 'cache_dir')

    datatxt = DataTXT(app_id=app_id,
                      app_key=app_key,
                      cache=FileCache(cache_dir)
                      )

    g = SKOSGraph()
    g.parse(infile, format='xml')

    query = u'SELECT DISTINCT ?a ?b WHERE { ?a skos:prefLabel ?b .}'
    qres = g.query(query, initNs=dict(skos=SKOSNS))

    i = 0
    tot = len(qres)
    print tot
    for subject_url, name in qres:
        i = i + 1
        name = unicode(name)
Code example #24
#!/usr/bin/env python

from dotenv import load_dotenv
from dandelion import DataTXT

import speech_recognition as sr
import random
import yaml
import os

load_dotenv()

r = sr.Recognizer()
mic = sr.Microphone(device_index=0)
datatxt = DataTXT(token=os.getenv('TOKEN'))


def compare(cmd: str, cmds: map):
    best_cmd = list(cmds)[0]
    similarity = 0
    for cmd_ in cmds:
        res = datatxt.sim(cmd_, cmd, lang='en')
        print('>', cmd_, res['similarity'])
        if res['similarity'] > similarity:
            best_cmd = cmd_
            similarity = res['similarity']

    return best_cmd


def listen():
Code example #25
import os
import nltk
import tokenize
from nltk import ne_chunk
from nltk.parse import stanford
from nltk.parse.stanford import StanfordDependencyParser
from graphviz import Source
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.moses import MosesDetokenizer
from dandelion import DataTXT

datatxt = DataTXT(app_id='78697915c52f48f5b3bd6c7bb603b2a2',
                  app_key='78697915c52f48f5b3bd6c7bb603b2a2')
h, k = 100, 100
#num = 0
l = 9
i = 0
j = 0
main = []
trial = []
tagged = [[0 for x in range(h)] for y in range(k)]
named = [[0 for x in range(h)] for y in range(k)]
list1 = [[0 for x in range(h)] for y in range(k)]
detoken = [[0 for x in range(h)] for y in range(k)]
ini_path = r'C:\Stanford'
os.environ['STANFORD_PARSER'] = r'C:\Stanford\stanford-parser.jar'
os.environ['STANFORD_MODELS'] = r'C:\Stanford\stanford-parser-3.5.2-models.jar'
os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk1.8.0_161/'
parser = stanford.StanfordParser(
    r'C:\Stanford\stanford-parser.jar',
Code example #26
import requests
import json
import csv
from dandelion import DataTXT
import dandelion
import pandas as pd

datatxt = DataTXT(app_id='', app_key='ENTER YOUR DANDELION API KEY')
if __name__ == '__main__':
    query1 = raw_input("Enter the input claim:")
    inputFile = raw_input("Enter the file name to be checked for:")
    inputFile = inputFile + ".csv"
    colnames = [
        "username", "date", "retweets", "favorites", "text", "geo", "mentions",
        "hashtags", "id", "permalink"
    ]
    with open(inputFile) as csvfile:
        inputReader = pd.read_csv(csvfile, sep="|", error_bad_lines=False)
        textReader = inputReader['text']
        outputFile = inputFile.split('.')[0] + "_results.csv"
        with open(outputFile, "w") as f:
            try:
                for i in range(0, inputReader.shape[0]):
                    query2 = textReader[i]
                    print(query2)
                    try:
                        response = datatxt.sim(query1, query2)
                        if response.similarity > 0.4:
                            f.write(str(i))
                            f.write('|')
                            f.write(query2)
Code example #27
    def test_can_set_host(self):
        self.datatxt = DataTXT(host="api.dandelion.eu")
        self.test_nex()

        self.datatxt = DataTXT(host="http://api.dandelion.eu")
        self.test_nex()
Code example #28
import datetime
import codecs
import time
import json

from dandelion import DataTXT

with open("config.json") as fin:
    config_data = json.load(fin)

#datatxt = DataTXT(app_id = config_data['application_id'], app_key = config_data['application_key'])
datatxt = DataTXT(token = config_data['token'])


def simple_clean(text):
    text = " ".join(text.replace("’","'").split())
    return text.lower()



#import spacy
#nlp = spacy.load('it_core_news_sm',
#                 disable=["tagger", "parser", "ner"])
#def spacy_clean(text):
#    text = " ".join(text.replace("’","'").split())
#    doc = nlp(text)
#    tokens = [token.lemma_.strip() for token in doc if
#              not token.is_stop
#              and not nlp.vocab[token.lemma_].is_stop
#              and not token.is_punct
#              and not token.is_digit
Code example #29
 def setUp(self):
     # Retrieve all tweets
     tweets = list(mongo_manager.MongoManager(configuration.db_name).find("tweets", {}))[10:16]
     self.datatxt = DataTXT(app_id=configuration.APP1_ID, app_key=configuration.API_KEY_DANDELION1)
     self.t = tweets_chunk.TweetsChunk(tweets)
Code example #30
def loginDandelion():
    file = open('credentialsDandelion.json')
    keys = json.load(file)
    datatxt = DataTXT(app_id=keys['app_id'], app_key=keys['app_key'])
    return datatxt
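
loginDandelion expects a credentialsDandelion.json file with app_id and app_key entries; a minimal sketch of creating such a file (values are placeholders):

# Hypothetical helper: write the credentials file read by loginDandelion.
import json
with open('credentialsDandelion.json', 'w') as f:
    json.dump({'app_id': 'YOUR_APP_ID', 'app_key': 'YOUR_APP_KEY'}, f)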
Code example #31
def final_score(event, keywords):

	textrazor.api_key = "9dcd16199684c470157ce02dc8ced9357b28f61dd685df6acc8dfd62"
	infocsv = pd.read_csv(event.csv_file.path, header=None)
	print("INFOOOO")
	print(infocsv.shape)
	print(infocsv.iloc[2,2])
	dandelionclient = DataTXT(app_id = '9355e03c7d5e4b879e6af9d8575159d2', app_key = '9355e03c7d5e4b879e6af9d8575159d2')
	# keywords = "reactjs, react.js, redux, React.js"

	a=[]
	output = []

	for count in range(infocsv.shape[0]):
		applicant = Applicant()
		applicant.name = str(infocsv.iloc[count, 0])
		applicant.college = str(infocsv.iloc[count, 1])
		applicant.email = str(infocsv.iloc[count, 2])
		applicant.github_url = str(infocsv.iloc[count,3])
		if(applicant.github_url == "nan"):
			applicant.delete()
			break
		applicant.quora_url = infocsv.iloc[count,4]
		applicant.resume_link = str(infocsv.iloc[count,5])
		applicant.number = infocsv.iloc[count, 6]
		applicant.event = event
		applicant.save()
		print("resume_link")
		print(applicant.resume_link)

		print("RESUME INFO")
		# if __name__ == "__main__":
		words = applicant.resume_link.split('/')
		file_id = words[len(words)-2]
		print("File ID", file_id)
		destination = './' +file_id + '.pdf'
		print("Destination:", destination)
		download_file_from_google_drive(file_id, destination)

		convertapi.api_secret = 'Zgeg7qFLxqDtCAJr'
		result = convertapi.convert('txt', { 'File': './' + file_id + '.pdf' })
		result.file.save('./')

		f1 = open('./' + file_id + '.txt', "r", encoding="utf8")
		resumeinfo = f1.read()
		print(resumeinfo)
		print("="*100)
		try:
			client = textrazor.TextRazor(extractors=["entities", "topics"])
			response = client.analyze(resumeinfo)
			related_keyword_resume=[]
			for topic in response.topics():
				if topic.score>0.7:
					related_keyword_resume.append(topic.label)
			rel_key_resume=', '.join(related_keyword_resume)
			print(rel_key_resume)
			r = dandelionclient.sim(rel_key_resume, keywords, lang="en", bow="one_empty")
			resumesimilarity = r.similarity*25
		except:
			resumesimilarity = 0
		print("--"*100)

		print("QUORA INFO")
		quorainfo = get_user_info_quora(applicant.quora_url)
		print(quorainfo)
		print("="*100)
		if quorainfo != "":
			try:
				client = textrazor.TextRazor(extractors=["topics"])
				response = client.analyze(quorainfo)
				related_keyword_qra=[]
				for topic in response.topics():
					if topic.score>0.7:
						related_keyword_qra.append(topic.label)
				rel_key_quora=', '.join(related_keyword_qra)
				print(rel_key_quora)
				r = dandelionclient.sim(rel_key_quora, keywords, lang="en", bow="one_empty")
				quorasimilarity = r.similarity*15
			except Exception as e:
				print(e)
				quorasimilarity = 0
		else:
			quorasimilarity = 0
		print("--"*100)

		print("GITHUB INFO")
		gitinfo = get_user_info_git(applicant.github_url)[0]
		print(gitinfo)
		print("=="*100)
		try:
			client = textrazor.TextRazor(extractors=["topics"])
			response = client.analyze(gitinfo)
			related_keyword_git=[]
			for topic in response.topics():
				if topic.score>0.7:
					related_keyword_git.append(topic.label)
			rel_key_git=', '.join(related_keyword_git)
			print(rel_key_git)
			print("--"*100)
			r = dandelionclient.sim(rel_key_git, keywords, lang="en", bow="one_empty")
			gitsimilarity = r.similarity*60
		except:
			gitsimilarity = 0
		print("+"*100)
		print(quorasimilarity, resumesimilarity, gitsimilarity)
		a.append(quorasimilarity+resumesimilarity+gitsimilarity)
		applicant.score = a[-1]
		applicant.save()
		output.append(applicant)

	output.sort(key=lambda x: x.score, reverse=True)
	print(a)
	return output
Code example #32
def get_entities_from_dandelion(text):
    # TODO: put the keys in a settings file
    datatxt = DataTXT(app_id='7c418708', app_key='0043c60be84a1f471184a192fe06e540')
    result = datatxt.nex(text, include_lod=True, language='en')
    return result
Code example #33
import tqdm
import _pickle as cPickle
from DataReader import DataReader
from dandelion import DataTXT
from dandelion import default_config
from TweetNormalizer import normalizeTweet

# Initializing Dandelion API (can be obtained from https://dandelion.eu/)
default_config['token'] = 'INSERT TOKEN'
datatxt = DataTXT()

# Loading data
dr_tr = DataReader('./Data/olid-training-v1.tsv', 'A')
data_tr, labels_tr = dr_tr.get_labelled_data()
dr_tst = DataReader('./Data/testset-levela.tsv', 'A')
data_tst, label_tst = dr_tst.get_test_data()

data_tr = data_tr[:]
data_tst = data_tst[:]

entities_tr = []
entities_tst = []

# Entity extraction using dandelion
for line in tqdm.tqdm(data_tr):
    temp = []
    for annotation in datatxt.nex(normalizeTweet(line), lang='en').annotations:
        temp.append(annotation.title)
    entities_tr.append(temp)

for line in tqdm.tqdm(data_tst):
Code example #34
response = urllib2.urlopen(m.group(1))
html = response.read()

cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True

#print lxml.html.tostring(cleaner.clean_html(lxml.html.parse(url)))
clean = cleaner.clean_html(lxml.html.parse(url))
clean = lxml.html.tostring(clean)

soup = BeautifulSoup(clean, 'lxml')
text = soup.get_text()

datatxt = DataTXT(app_id='d40305b7',
                  app_key='7d432531dfb0d3173212d4203f25d4b6')

#response = datatxt.sim(text, "The ultimate skel-ebration of monster mania, this year's Monster High dance will be the monster bash to end all bashes (if it happens)! And as the Monster High ghouls make new beast friends, the horror show really begins. This freaky fabulous new character is larger than unlife at 17 inches tall! And of course, she wears an over-the-tent fashion with lots of ")

paragraphs = list()
match = list()

for line in text.splitlines():
    if len(line) > 20:
        paragraphs.append(line)

paragraphs = paragraphs[0:5]
for p in paragraphs:
    response = datatxt.sim(
        p,
        "The ultimate skel-ebration of monster mania, this year's Monster High dance will be the monster bash to end all bashes (if it happens)! And as the Monster High ghouls make new beast friends, the horror show really begins. This freaky fabulous new character is larger than unlife at 17 inches tall! And of course, she wears an over-the-tent fashion with lots of "