Example no. 1
def extract_entities_textrazor(snippet):
    """
    Extract three kinds of entities (researcher names, university names, and
    numbers such as years), together with confidence scores, using the
    textrazor package.
    :param snippet: text to analyze
    :return: dict keyed by 'RN' (researcher name), 'U' (university), 'Y' (year/number)
    """

    textrazor.api_key = "7b4d6194cabab0a5c05bd34ad0ba423520a4a3d33a0304b9783971c9"

    client = textrazor.TextRazor(extractors=["entities", "topics"])
    response1 = client.analyze(snippet)

    output = {'RN': {'entity': [], 'confidenceScore': []},
              'U': {'entity': [], 'confidenceScore': []},
              'Y': {'entity': [], 'confidenceScore': []}}

    for entity in response1.entities():
        # entities that carry Freebase types: people and organizations
        if len(entity.freebase_types) > 0:
            if 'person' in entity.freebase_types[0]:
                output['RN']['entity'].append(entity.json['entityId'].split(' '))
                output['RN']['confidenceScore'].append(entity.confidence_score)
            elif 'organization' in entity.freebase_types[0]:
                output['U']['entity'].append(entity.json['entityId'].split(' '))
                output['U']['confidenceScore'].append(entity.confidence_score)

        else:
            # untyped entities: keep bare numbers (candidate years)
            if entity.json.get('type') == ['Number']:
                output['Y']['entity'].append(entity.json['entityId'].split(' '))
                output['Y']['confidenceScore'].append(entity.confidence_score)

    return output
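A minimal usage sketch for the function above, assuming the textrazor package is installed and the key is still valid; the sample sentence is hypothetical test input:

import textrazor  # required by extract_entities_textrazor

sample = ("Pegah Alizadeh was born September 21 1983 in Ahvaz, Iran. "
          "She finished her PhD at the University of Paris 13 in 2016.")
result = extract_entities_textrazor(sample)
print(result['RN'])  # tokenized person names with confidence scores
print(result['U'])   # university / organization entities
print(result['Y'])   # bare numbers such as years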
Example no. 2
def get_keywords(text):
    '''
    Find the keywords of a text with textrazor.
    Note: TextRazor allows a maximum of 500 queries per day on the free plan.
    Input:
    text: string

    Output: dict mapping entity id -> list of matched Freebase type domains
    '''
    textrazor.api_key = "1b69ecec7f8c72d386c2c5280780e6eb6ec00510e2a221d98e246c82"
    client = textrazor.TextRazor(extractors=["entities", "topics"])

    response = client.analyze(text)
    entities = list(response.entities())
    entities.sort(key=lambda x: x.relevance_score, reverse=True)
    seen = set()
    result = {}
    for entity in entities:
        if entity.id not in seen:
            seen.add(entity.id)
            list_of_cats = []
            for i in entity.freebase_types:
                # Freebase types look like "/travel/destination"; keep the top-level domain
                if i.startswith("/"):
                    i = i.split("/")[1]
                if i in ['travel', 'projects', 'location', 'arts', 'food', 'sports',
                         'media_common', 'exhibition', 'architecture', 'geography',
                         'visual_art', 'protected_sites']:
                    list_of_cats.append(i)
            if list_of_cats:
                result[entity.id] = list_of_cats
    return result
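A hedged usage sketch for get_keywords; the input is hypothetical and the output shape is only illustrative, since the actual categories depend on what the API returns:

import textrazor  # needed by get_keywords

keywords = get_keywords("The Louvre in Paris exhibits classical visual art.")
# illustrative result shape: entity id -> matched top-level Freebase domains,
# e.g. {'Louvre': ['architecture', 'travel'], 'Paris': ['location', 'travel']}
print(keywords)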
Example no. 3
def categorize(filename):
    finalcat = []
    finalscore = []
    textrazor.api_key = "f8656917eff9fdb7989aafbb22a8c8e1b74ebd076f1040c75de4dfcc"
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    path = app.config['UPLOAD_FOLDER'] + '/' + filename
    client.set_classifiers(["textrazor_newscodes"])
    # file() is Python 2 only; read the upload with open() and UTF-8 decoding
    with open(path, encoding="utf-8") as f:
        input_file = f.read()
    r = Rake()
    r.extract_keywords_from_text(input_file)
    startLines = input_file[0:100]
    response = client.analyze(input_file)
    entities = list(response.entities())
    entities.sort(key=lambda x: x.relevance_score, reverse=True)
    seen = set()
    keywords = list()
    info = list()
    for entity in entities:
        if entity.id not in seen:
            seen.add(entity.id)
            keywords.append(entity.id)
    mydb.keywords.insert({"keywords": keywords, "name": filename})
    print("--------------------------------------------")
    topiclist = list()
    for topic in response.topics():
        if topic.score > 0.3:
            topiclist.append(topic.label)
            mydb.topic.insert({"topic": topic.label})

    print("------------------------------------------------------")
    categorylist = list()
    try:
        for category in response.categories():
            alterLabel = category.label.split(">")
            finalcat.append(alterLabel[-1])
            finalscore.append(category.score)
            print(category.label)
            categorylist.append(alterLabel[-1])
            mydb.category.insert({"category": alterLabel[-1]})
        # use the first appended category and score as the document label
        k = finalcat[0]
        s = finalscore[0]
        mydb.doccat.insert({"classified": k, "Document": filename, "Score": s, "startLines": startLines})
        mydb.record.insert(
            {"name": filename, "description": [{"keywords": keywords, "topic": topiclist, "category": categorylist}]})
        output = "Category : " + str(k)
        return jsonify(result=output)
    except Exception:
        return jsonify(result="unable to categorize")
Example no. 4
def get_tags(text):
    textrazor.api_key = "631d67844c4e5bf22a4dfe37afcd0f08a3c330b54a8ca798a0970846"

    client = textrazor.TextRazor(extractors=["topics"])

    # classifiers=['textrazor_mediatopics', 'textrazor_newscodes', 'textrazor_iab', 'textrazor_iab_content_taxonomy']
    client.set_classifiers(['textrazor_iab'])

    response = client.analyze(text)
    if not response.ok:
        print(response.error)
        print(response.message)
        return []

    tags = []
    for c in response.categories():
        if c.score > 0.5:
            category = re.sub(r"[>]+", "/", c.label)
            tag = category.split('/')[-1]
            tag = re.sub(r"[\s&]+", "_", tag)

            if len(tag) > 0:
                tag = '#' + tag.lower()
                tags.append(tag)

    for c in response.topics():
        if c.score == 1.0:
            tag = c.label
            tag = re.sub(r"[\s&]+", "_", tag)
            tag = '#' + tag.lower()
            tags.append(tag)

    return tags
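A short usage sketch for get_tags; the input is hypothetical and the exact tags depend on the classifier scores the API returns:

import re        # used inside get_tags
import textrazor

tags = get_tags("Apple unveiled a new smartphone at its annual developer event.")
# illustrative output: lowercase hashtag strings built from IAB categories
# and maximum-score topics, e.g. ['#technology_computing', '#smartphones']
print(tags)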
Example no. 5
def get_wikilinks(facts):
    textrazor.api_key = "f87456a08da3eff12e62ebdb2bcf2a8be4baaeb4b79be19fce12f770"
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    for fact in facts:
        response = client.analyze(fact.text)
        print(response.json)
Example no. 6
def textrazorAPI(df):
    text_content = df['article']
    client = textrazor.TextRazor(extractors=["entities", "topics", "dependency-trees",
                                             "relations", "entailments", "senses"])
    response = client.analyze(text_content)

    response_json = response.json

    # list of dictionaries. Dictionary keys are id, label, score, wikiLink, wikidataId
    try:
        df['textrazorAPItopics'] = response_json['response']['topics']
    except KeyError:
        df['textrazorAPItopics'] = np.nan

    # list of dictionaries. Dictionary keys are id, label, score, wikiLink, wikidataId
    try:
        df['textrazorAPIcoarseTopics'] = response_json['response']['coarseTopics']
    except KeyError:
        df['textrazorAPIcoarseTopics'] = np.nan

    #  list of dictionaries. Dictionary keys are
    try:
        df['textrazorAPIentities'] = response_json['response']['entities']
    except KeyError:
        df['textrazorAPIentities'] = np.nan

    # list of two dictionaries. Dictionaries are two entities with relation:
    # keys are id, param - list of dict, keys are:
    # relation, wordPositions (key to list of ints)
    try:
        df['textrazorAPIrelations'] = response_json['response']['relations']
    except KeyError:
        df['textrazorAPIrelations'] = np.nan

    # list of dictionaries. Dictionary keys are:
    # position - int with the position of the sentence
    # words - dict with keys endingPos, lemma, parentPosition, partOfSpeech, position,
    # relationToParent, startingPos, stem, token
    try:
        df['textrazorAPIsentences'] = response_json['response']['sentences']
    except KeyError:
        df['textrazorAPIsentences'] = np.nan

    # list of dictionaries. Dictionary keys are contextScore, entailedTree(key to a list),
    # entailedWords(key to a list), id, prior score, score, wordPositions(key to a list)
    try:
        df['textrazorAPIentailments'] = response_json['response']['entailments']
    except KeyError:
        df['textrazorAPIentailments'] = np.nan

    # list of dictionaries. Dictionary keys are id, propertyPositions(key to a list),
    # wordPositions(key to a list)
    try:
        df['textrazorAPIproperties'] = response_json['response']['properties']
    except KeyError:
        df['textrazorAPIproperties'] = np.nan

    return df
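Since textrazorAPI reads df['article'] as a single string, df is effectively one row; below is a hedged sketch of applying it across a whole DataFrame, assuming pandas and numpy are available and that textrazor.api_key is set by the caller (the snippet itself never sets it; the key below is a placeholder):

import numpy as np   # used for np.nan inside textrazorAPI
import pandas as pd  # assumption: each df argument is a pandas row (Series)
import textrazor

textrazor.api_key = "YOUR_API_KEY"  # placeholder; assumed to be set elsewhere

articles = pd.DataFrame({"article": ["First article text ...", "Second article text ..."]})
# each row is passed as the `df` argument; the returned rows form the enriched frame
enriched = articles.apply(textrazorAPI, axis=1)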
Example no. 7
def analyzeText(text):
    textrazor.api_key = "b695217cdaeb234d8a4edd867e1ab59b23aa1d050fa063c5f4a3a89a"
    client = textrazor.TextRazor(extractors=["topics"])
    response = client.analyze(text)
    topics = [str(topic.label) for topic in response.topics()[:8]]
    # join the labels so no dangling "</h6> <h6>" trails the last topic
    return "</h6> <h6>".join(topics)
Example no. 8
 def __init__(self, url, username):
     client = textrazor.TextRazor(extractors=["entities", "topics"])
     client.set_cleanup_mode("cleanHTML")
     client.set_cleanup_return_cleaned(return_cleaned=True)
     client.set_classifiers(["textrazor_newscodes"])
     self.url = url
     self.username = username
     self.response = client.analyze_url(url)
     # keep a sorted copy: sorting the list returned by entities() in place
     # may not persist if the client rebuilds the list on each call
     self.entities = sorted(self.response.entities(),
                            key=lambda x: x.relevance_score, reverse=True)
Example no. 9
def concept_extract(text):
    client = textrazor.TextRazor(YOUR_API_KEY, extractors=["entities"])
    response = client.analyze(text)
    concept_set = []
    for entity in response.entities():
        if 'entityId' in entity.json:
            concept_set.append(entity.json['entityId'])
            print(entity.json)
    return concept_set
Example no. 10
 def get_news_statistics(self, text):
     """Retrieve the list of topics(keywords) from the API for the given text."""
     textrazor.api_key = self.textrazor_apikey
     client = textrazor.TextRazor(extractors=["topics","entities"])
     client.set_classifiers(["textrazor_newscodes"])
     response = client.analyze(text)
     if len(response.topics()) == 0 and len(response.entities()) == 0 and len(response.categories()) == 0:
         return None
     return response
Example no. 11
def get_enrichment(book_uri, synopsis, rdf_graph):

    client = textrazor.TextRazor(extractors=["entities", "topics"])
    response = client.analyze(synopsis)

    if response.ok:
        response2graph(book_uri, response, synopsis, rdf_graph)
    else:
        print('Error: ', response.error)
Example no. 12
 def get_entities_from_url(self, url):
     try:
         textrazor.api_key = TEXT_RAZOR_API_KEY_1
         client = textrazor.TextRazor(
             extractors=[ENTITES, RELATIONS, TOPICS])
         response = client.analyze_url(url)
         return response
     except Exception as e:
         print("ERROR For URL: {0} - {1}".format(url, str(e)))
         raise
Example no. 13
 def get_entities_from_text(self, text):
     try:
         textrazor.api_key = TEXT_RAZOR_API_KEY_1
         client = textrazor.TextRazor(
             extractors=[ENTITES, RELATIONS, TOPICS])
         response = client.analyze(text)
         return response
     except Exception as e:
         print(str(e))
         raise Exception(str(e))
Example no. 14
def print_topic_score(url):
    """Print 10 most probable topics and confidence score for a text from a text on a website url passed as input."""
    textrazor.api_key = "fab1f5ef253a7daa2ec64726f01738f24bf84c59dde7c66f1ec1cd04"

    client = textrazor.TextRazor(extractors=["topics"])
    client.set_language_override('fre')
    response = client.analyze_url(url)

    for topic in response.topics()[:10]:
        print(topic.label, topic.score)
Example no. 15
 def detect_target_garment(content, target):
     client = textrazor.TextRazor(api_key="13b22cd6d8562948feeddee54a992ef4edfb1b9d8c3df54a70f40810", extractors=["entities"])
     client.set_entity_freebase_type_filters(["/fashion/garment", "/business/product_category"])
     response = client.analyze(content)
     garments = [ent.id for ent in response.entities()]
     # return an explicit boolean instead of falling through to None
     return target in garments
Example no. 16
def getTopic(string):
    topic_distri = dict()
    if string == "":
        return topic_distri
    textrazor.api_key = "528d21faef2b391e46cc77bfa8b1a9d28dd00a7f77ee562f2811520a"
    client = textrazor.TextRazor(extractors=["topics"])
    response = client.analyze(string)
    for topic in response.coarse_topics():
        topic_distri[topic.label] = topic.score
    return topic_distri
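A hedged usage sketch for getTopic; the input is hypothetical and the labels and scores are only illustrative:

import textrazor

distribution = getTopic("Electric cars are reducing air pollution in large cities.")
# illustrative shape: coarse topic label -> score, e.g. {'Transport': 0.9, 'Environment': 0.8}
print(distribution)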
Example no. 17
    def __init__(self, aiml_instance):
        textrazor.api_key = "d64cc7e640600e8e2305304d8e79e6b945b575825a72ce4b853da187"
        self.client = textrazor.TextRazor(extractors=["dependency-trees"])
        self.max_position = 0
        self.mark_array = []
        self.token_array = []
        self.phrase = []

        self.marker = ["xcomp", "advcl", "conj"]

        self.aiml_instance = aiml_instance
Example no. 18
def run(query):

    dir_path = os.path.dirname(os.path.realpath(__file__))

    with open(os.path.join(dir_path, 'configurations/config.txt')) as data_file:
        json_obj = json.load(data_file)

    # Initialize the API keys
    wclient = wolframalpha.Client(json_obj['wolframalpha'])
    textrazor.api_key = json_obj['textrazor']

    # Extract the true meaning of the sentence
    tclient = textrazor.TextRazor(extractors=["entities"])
    response = tclient.analyze(query)
    query = response.entities()[0].id

    # Perform query
    response = wclient.query(query)

    for pod in response.pods:

        if pod.title == 'Wikipedia summary' and pod.text is not None:
            print('Wikipedia summary : ' + pod.text)

        if pod.title == 'Response':
            print(pod.text)

        if pod.title == 'Basic information':
            print(pod.text.split('\n')[0] + '\n' + pod.text.split('\n')[1])

        if pod.title in ('Result', 'Current result', 'Approximate result', 'Results', 'Average result'):
            print(pod.text)

        if pod.title == 'Notable facts':
            print('* ' + pod.text.split('\n')[0] + '\n* ' + pod.text.split('\n')[1] + '\n* ' + pod.text.split('\n')[2])

        if pod.title == 'Bordering countries/regions':
            print('Bordering countries/regions -> ' + pod.text)

        if pod.title == 'Location':
            print(pod.text)

        if pod.title == 'Capital city':
            print('Capital city -> ' + pod.text)

        if pod.title == 'Currency':
            print('Currency -> ' + pod.text.split('\n')[1])

        if pod.title == 'Value':
            print(pod.text.split('\n')[0])

        if pod.title == 'Morse code translation':
            print(pod.text)
Example no. 19
def getWikipediaLinks(textToAnnotate):
    wikipediaLinks = []
    client = textrazor.TextRazor(extractors=["entities"])
    response = client.analyze(textToAnnotate)
    for entity in response.entities():
        wikipediaLinks.append(entity.wikipedia_link)

    wikipediaLinks = list(dict.fromkeys(wikipediaLinks))  #Removing duplicates
    wikipediaLinks = list(filter(None,
                                 wikipediaLinks))  #Removing empty strings

    return wikipediaLinks
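A usage sketch for getWikipediaLinks; note that the snippet itself never sets textrazor.api_key, so the caller must do so first (the key below is a placeholder):

import textrazor

textrazor.api_key = "YOUR_API_KEY"  # placeholder
links = getWikipediaLinks("Marie Curie carried out her research at the University of Paris.")
# deduplicated, non-empty Wikipedia URLs for the detected entities
print(links)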
Example no. 20
def classification(text):
    """news classification using text razor api"""

    textrazor.api_key = "2afab77eb63718df82c96d0669e0017cb0c6bcabb2c0ae4044fa58a7"
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    client.set_classifiers(["textrazor_newscodes"])
    response = client.analyze(text)

    categories = response.categories()
    category = categories[0].label

    return category
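A usage sketch for classification; the headline is hypothetical and the label is illustrative, drawn from the IPTC newscodes taxonomy the snippet selects:

import textrazor

headline = "The central bank raised interest rates by half a percentage point."
print(classification(headline))
# illustrative output: a newscodes label such as "economy, business and finance"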
Example no. 21
def textrazor(item, tool_name):
    text = item["text"]
    dpaId = item["dpaId"]

    textrazor_function.api_key = api_key
    client = textrazor_function.TextRazor(extractors=["entities", "words"])
    try:
        response = client.analyze(text)

        if not response.ok:
            output = [False, response.message]
        elif response.language != "ger":
            output = [False, response]
        elif len(response.entities()) == 0:
            output = [True, []]
        else:
            t = time.time()
            timestamp = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.utcfromtimestamp(t))
            annotation = []
            for entity in response.entities():
                if entity.wikidata_id is None:
                    uri = "QO"
                    category = "OTH"
                else:
                    uri = entity.wikidata_id
                    category = query_category(uri)
                insert_dict = {
                    "start": entity.starting_position,
                    "end": entity.ending_position,
                    "label": entity.id,
                    "surface": entity.matched_text,
                    "uri": uri,
                    "category_tool": "",
                    "category": category,
                    "dpaid": dpaId,
                    "timestamp": timestamp,
                    "tool": tool_name
                }
                annotation.append(insert_dict)
            output = [True, annotation]
    except TextRazorAnalysisException:
        output = [False, "http error"]
    return output
Example no. 22
def concept_extract_save(filepath, text):
    client = textrazor.TextRazor(YOUR_API_KEY, extractors=["entities"])
    response = client.analyze(text)
    concept_set = []
    with open(filepath, 'w+', encoding='UTF-8') as output_file:
        for entity in response.entities():
            if 'entityId' in entity.json:
                concept_set.append(entity.json['entityId'])
            print(entity.json)
            output_file.write(str(entity.json) + '\n')
    return concept_set
Example no. 23
def get_similarity_with_topic(transcript, speechTopic, categories):
    client = textrazor.TextRazor(extractors=["topics"])
    textrazor_response = client.analyze(transcript)
    topic_list = textrazor_response.topics()
    print('topic list is', topic_list)
    keyword_list = []
    for topic in topic_list:
        keyword_list.append(topic.label)

    category_list = textrazor_response.categories()
    print(category_list)
    return keyword_list
Example no. 24
def get_entities(text):
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    response = client.analyze(text)
    ret = []
    for entity in response.entities():
        # compare the raw float scores; int() truncation would zero them out
        if entity.relevance_score + entity.confidence_score > 0.5:
            ret.append({
                'id': entity.id,
                'relevance_score': entity.relevance_score,
                'confidence_score': entity.confidence_score,
                'freebase_types': entity.freebase_types
            })
    return ret
Example no. 25
def text_razor(repo_description):
    """
    Use TextRazor to process textual input.
    """

    textrazor.api_key = '5f6331ac5ecb61dfe6e57d9706eeb4f9e7bceaa82a4a37b128cb0201'
    client = textrazor.TextRazor(extractors=['entities'])
    # language override is set on the client; a bare module attribute is ignored
    client.set_language_override('eng')
    response = client.analyze(repo_description)
    phrases = response.entities()
    keywords = [item.matched_text for item in phrases]
    kw_str = ', '.join(keywords)

    return kw_str
Example no. 26
def getKeywordsArray(site_url, min_relevance_score, min_topic_score):
    textrazor.api_key = "1f0ebd1fc796a631ec72919329071930fede6007817a81744071c643"
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    keywords = []  # collect entity ids and topic labels
    response = client.analyze_url(site_url)

    for entity in response.entities():
        if entity.relevance_score > min_relevance_score and entity.id not in keywords:
            keywords.append(entity.id)

    for topic in response.topics():
        if topic.score > min_topic_score and topic.label not in keywords:
            keywords.append(topic.label)

    return keywords
Example no. 27
 def extract(self,
             text,
             extractors="entities,topics",
             lang="fr",
             min_confidence=0.0):
     self.lang = lang
     self.text = text
     lang = lang.replace("fr", "fre").replace("en", "eng")
     textrazor.api_key = self.api_key
     # honor the extractors argument rather than hardcoding ["entities"]
     client = textrazor.TextRazor(extractors=extractors.split(","))
     client.set_language_override(lang)
     response = client.analyze(text)
     # keep only entities at or above the requested confidence threshold
     entities = [entity.json for entity in response.entities()
                 if entity.confidence_score >= min_confidence]
     self.annotations = entities
Example no. 28
def process_or_store(alltweets):
    textrazor.api_key = "813c3fc408c749a28006cca97f5865dca86c569f77ed08728b151bc0"
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    # strip @mentions and URLs before sending the tweets to the API
    alltweets = re.sub(r"(?:\@|https?\://)\S+", "", alltweets)
    client.set_classifiers(["textrazor_iab"])
    response = client.analyze(alltweets)
    print("##############################################################################################")

    print("\n\n\nEntities")
    for entity in response.entities():
        if entity.confidence_score > 0.7:
            if entity.dbpedia_types:
                for e in entity.dbpedia_types:
                    # entities_dictionary and score are assumed to be module-level
                    # containers (e.g. a defaultdict(list) and a dict)
                    if entity.id not in entities_dictionary[e]:
                        entities_dictionary[e].append(entity.id)
                    if entity.id in score:
                        score[entity.id] += 1
                    else:
                        score[entity.id] = 1

    topics = ""
    for topic in response.topics():
        topics += topic.label + "\n"

    print("\n\nAfter Topics")

    # second pass: analyze the topic labels themselves
    response = client.analyze(topics)
    for entity in response.entities():
        if entity.confidence_score > 0.7:
            if entity.dbpedia_types:
                for e in entity.dbpedia_types:
                    if entity.id not in entities_dictionary[e]:
                        entities_dictionary[e].append(entity.id)
                        score[entity.id] = 1

    print(entities_dictionary)
    print(score)
Example no. 29
def get(
    url='https://www.politico.com/news/2020/02/21/bernie-sanders-condemns-russian-116640'
):
    try:
        article = Article(url)
        article.download()
        article.parse()
    except Exception:
        return {}

    text = article.title
    date = article.publish_date
    days_to_subtract = 2
    try:
        d = (date - timedelta(days=days_to_subtract)).strftime('%Y-%m-%d')
        d2 = (date + timedelta(days=days_to_subtract)).strftime('%Y-%m-%d')
    except TypeError:
        date = datetime.now()
        d = (date - timedelta(days=days_to_subtract)).strftime('%Y-%m-%d')
        d2 = (date + timedelta(days=days_to_subtract)).strftime('%Y-%m-%d')

    alt_api_key = 'feca0c9db3d492ac63a83761a41d003f306c5acfff3b828b8c1319da'
    textrazor.api_key = '3db6ae4b1e8b2e04ee07657ca98d0de9eda7b885b3043dc11ab9b230'

    client = textrazor.TextRazor(extractors=["words", "phrases"])
    response = client.analyze(text)

    query = ''
    for np in response.noun_phrases():
        query += '{} '.format(
            text[np.words[0].input_start_offset:np.words[-1].input_end_offset])

    print(query)

    news_parameters = {
        'q': query,
        'from': d,
        'to': d2,
        'sortBy': 'popularity',
        'apiKey': 'a02790e5a3af4b5f8683318c276e702d'
    }

    response = requests.get('http://newsapi.org/v2/everything',
                            params=news_parameters)
    json_data = json.loads(response.text)

    return json_data
Example no. 30
 def update_api_key(api_key):
     TextRazorManager.api_key = api_key
     textrazor.api_key = api_key
     TextRazorManager.client = textrazor.TextRazor(extractors=[
         "customAnnotations",
         "coarseTopics",
         "entailments",
         "properties",
         "nounPhrases",
         "sentences",
         "categories",
         "entities",
         "topics",
         "relations",
     ])
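A hedged usage sketch: update_api_key looks like a class-level helper on a TextRazorManager class not shown here, so the call below assumes that context (the key is a placeholder):

# assumes TextRazorManager is the enclosing class of update_api_key
TextRazorManager.update_api_key("YOUR_API_KEY")  # placeholder key
response = TextRazorManager.client.analyze("Some text to annotate.")
print(response.ok)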