def test_text_tags(self): text = "On Monday, president Barack Obama will be..." results = text_tags(text) max_keys = sorted(results.keys(), key=lambda x:results.get(x), reverse=True) assert 'political_discussion' in max_keys[:5] results = text_tags(text, top_n=5) assert len(results) is 5 results = text_tags(text, threshold=0.1) for v in results.values(): assert v >= 0.1
def add_indico_text_tags(batch):
    article_texts = [article['content'] for article in batch]
    text_tags_dicts = indicoio.text_tags(article_texts, threshold=0.1)
    for i in range(len(batch)):
        text_tag_dict = text_tags_dicts[i]
        batch[i]['text_tags'] = text_tag_dict.items()
    return batch
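# A minimal usage sketch for the batch helper above (not part of the original
# source): the articles are made-up sample data and YOUR_API_KEY is a placeholder.
import indicoio

indicoio.config.api_key = "YOUR_API_KEY"

articles = [
    {"title": "Rate hike", "content": "The central bank raised interest rates today..."},
    {"title": "Cup final", "content": "The match went to penalties after extra time..."},
]

tagged = add_indico_text_tags(articles)
for article in tagged:
    # 'text_tags' holds (tag, probability) pairs with probability >= 0.1
    print(article["title"], sorted(article["text_tags"], key=lambda kv: -kv[1])[:3])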
def scrape(arg):
    results = {}
    twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
    ACCESS_TOKEN = twitter.obtain_access_token()
    twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)
    friends = twitter.get_friends_ids(screen_name=arg)
    # hard cap for time
    limit = 10
    for follower_id in friends['ids']:
        if limit > 0:
            params = {'user_id': follower_id, 'count': 3}
            tweets = twitter.get_user_timeline(**params)
            for tweet in tweets:
                encoded = tweet['text'].encode("utf8")
                # accumulate tag scores across tweets
                results = dict(Counter(results) + Counter(text_tags(encoded)))
            limit = limit - 1
        else:
            break
    # highest-scoring tag overall, paired with a random news item
    key = max(results.iteritems(), key=operator.itemgetter(1))[0]
    n = random.choice(news)
    return [key, n]
def analysis(data):
    sentiment = ind.sentiment_hq(data)
    tags = sort(ind.text_tags(data))
    languages = sort(ind.language(data))
    politics = sort(ind.political(data))
    keywords = sort(ind.keywords(data))
    names = sort(ind.named_entities(data))  # computed but never printed below

    print "Sentiment", sentiment
    print "\n\n\nTags"
    for t in tags:
        print t[0], float(t[1]) * 100
    print "\n\n\nLanguages"
    for l in languages:
        print l[0], float(l[1]) * 100
    print "\n\n\nPolitical"
    for p in politics:
        print p[0], float(p[1]) * 100
    print "\n\nKeywords"
    for k in keywords:
        print k[0], float(k[1]) * 100
def identify_keywords(text):
    text = text.encode("ascii", "ignore")
    print len(text)
    x = indicoio.text_tags(text, threshold=0.01, top_n=NUM_RESULTS)
    y = indicoio.keywords(text, threshold=0.01, top_n=NUM_RESULTS)
    x.update(y)  # merge keywords into the tag dict
    return x
def tweetCategory(getDF=False, insta=False):
    '''
    :return: (text_classAndSenti, text_list)
        text_classAndSenti (selected samples, 2):
            (:,0)=1 indicates that this tweet 1) is related to food and 2) is positive;
            (:,0)=0 otherwise
        text_list: original list of tweets
    '''
    FOOD = ["beer", "cooking", "general_food", "vegan", "vegetarian", "wine", "nutrition"]
    if not insta:
        os.system('curl "https://boiling-fire-6168.firebaseio.com/twitter_data.json?print=pretty" > twitter_data.json')
        with open('twitter_data.json') as json_data:
            data = json.load(json_data)
    else:
        # note: this branch fetches the same twitter_data endpoint; only the local filename differs
        os.system('curl "https://boiling-fire-6168.firebaseio.com/twitter_data.json?print=pretty" > instagram_data.json')
        with open('instagram_data.json') as json_data:
            data = json.load(json_data)

    # JSON -> list of texts
    df = pd.DataFrame.from_dict(data)
    df = df.transpose()
    print(df.info())
    lat = df['coordinate_1']
    lng = df['coordinate_2']
    in_toronto = []
    for idx, x in enumerate(lat):
        in_toronto.append(geo_results.is_in_circle(
            geo_results.TORONTO.latitude, geo_results.TORONTO.longitude,
            geo_results.radius, lng[idx], lat[idx]))
    print(in_toronto)
    df['in_toronto'] = in_toronto
    df = df[df['in_toronto'] == 1]
    print(df)
    text_list = df['text'].values.tolist()

    # get topics
    indicoio.config.api_key = 'dfd155c0984bed63c78aef5ce44763bf'
    topics = indicoio.text_tags(text_list, top_n=5)

    def topIncluded(topics, cat):
        # test if at least one element in topics is in cat
        for i in topics:
            if i in cat:
                return True
        return False

    # get sentiment analysis
    text_classAndSenti = np.zeros((len(text_list), 2))
    text_classAndSenti[:, 1] = indicoio.sentiment(text_list)

    # put text into classes (food is 1; otherwise 0)
    for i, t in enumerate(topics):
        top_topics = t.keys()
        if topIncluded(top_topics, FOOD) and text_classAndSenti[i, 1] > .5:
            text_classAndSenti[i, 0] = 1
        else:
            text_classAndSenti[i, 1] = 0  # clear sentiment info of non-food tweets

    if getDF:
        return text_classAndSenti, text_list, df
    else:
        return text_classAndSenti, text_list
def score(self, slide_length, window_length, AItype='tags'):
    self.parse(slide_length, window_length)
    if AItype == 'tags':
        self.scores['tags'] = [indicoio.text_tags(i) for i in self.strings]
    elif AItype == 'keywords':
        self.scores['keywords'] = [indicoio.keywords(i) for i in self.strings]
    elif AItype == 'names':
        self.scores['names'] = [indicoio.named_entities(i) for i in self.strings]
    else:
        # original raised with an undefined name `category`; report AItype instead
        raise ValueError('{} is not a valid category'.format(AItype))
def parse(message, number):
    store(message, number)
    userProf = analyzeUser(number)
    if comparePrev(message, number):
        return "Message Sent"
    else:
        ent = entityMatch(message)
        if ent == "None":
            print "keywords"
            print indicoio.keywords(message, version=2)
            print "tags"
            print indicoio.text_tags(message, threshold=.03)
            print "relevance"
            print indicoio.relevance("Renowned soccer legend Pele will be visiting...",
                                     ["Germany", "relocation", "Food", "Safety",
                                      "Family", "Transportation", "clothing"])
        else:
            # found entity, directing there (was a bare string with no effect)
            ticketCreate(message, number, ent)
def main():
    resultFileName = 'result.txt'
    indicoio.config.api_key = 'dfdb33f299507185178450a350d1a40a'
    questionStr = sys.argv[1]
    resDict = indicoio.text_tags(questionStr)
    maxTag = max(resDict.items(), key=operator.itemgetter(1))[0]
    with open(resultFileName, 'w') as f:
        f.write(maxTag)
def post(self):
    data = json.loads(self.request.body)
    api = data.get('api')
    data = data.get('data')
    if api == 'sentiment':
        result = indicoio.sentiment(data)
    else:
        result = [aggregate_score(scores, api)
                  for scores in indicoio.text_tags(data)]
    self.write(json.dumps(result))
    self.finish()
def test_text_tags(self):
    expected_keys = set([
        'fashion', 'art', 'energy', 'economics', 'entrepreneur', 'books',
        'politics', 'gardening', 'nba', 'conservative', 'technology',
        'startups', 'relationships', 'education', 'humor', 'psychology',
        'bicycling', 'investing', 'travel', 'cooking', 'christianity',
        'environment', 'religion', 'health', 'hockey', 'pets', 'music',
        'soccer', 'guns', 'gaming', 'jobs', 'business', 'nature', 'food',
        'cars', 'photography', 'philosophy', 'geek', 'sports', 'baseball',
        'news', 'television', 'entertainment', 'parenting', 'comics',
        'science', 'nfl', 'programming', 'personalfinance', 'atheism',
        'movies', 'anime', 'fitness', 'military', 'realestate', 'history'
    ])
    text = "On Monday, president Barack Obama will be..."
    results = text_tags(text)
    max_keys = sorted(results.keys(), key=lambda x: results.get(x), reverse=True)
    assert 'politics' in max_keys[:3]
    # every returned tag must come from the expected label set
    self.assertFalse(set(results.keys()) - expected_keys)
def gimme_the_goods(text, tag_count=3, persona_count=3):
    # Consume some of that api for analysis
    sentiment = indicoio.sentiment(text)
    # TODO figure out a better way to handle this bug
    political = indicoio.political(text[0:1100])
    personality = indicoio.personality(text)
    personas = indicoio.personas(text)
    tags = indicoio.text_tags(text, top_n=tag_count)

    # Sort the personas to grab the top ones
    top_personas = dict(sorted(personas.items(),
                               key=operator.itemgetter(1),
                               reverse=True)[:persona_count])

    # Truncate the values to 3 decimals for cleanliness
    roundness = 3
    sentiment = truncate_values(sentiment, roundness)
    political = truncate_values(political, roundness)
    personality = truncate_values(personality, roundness)
    top_personas = truncate_values(top_personas, roundness)
    tags = truncate_values(tags, roundness)

    # Rearrange the personas a bit
    final_personas = []
    for key, value in top_personas.items():
        final_personas.append({
            'type': persona_mapping[key],
            'name': key,
            'value': value,
        })

    return {
        'sentiment': sentiment,
        'political': political,
        'personality': personality,
        'personas': final_personas,
        'tags': tags,
    }
def main():
    if len(sys.argv) != 3:
        return
    inname = sys.argv[1]
    outname = sys.argv[2]
    with open(inname, mode='r') as inFile:
        tweets = json.load(inFile)
    count = 0
    for tweet in tweets:
        result = indicoio.text_tags(tweet['text'])
        tweet.update(result)
        count += 1
        if count % 100 == 0:
            print(count)
    # the original wrote the output file twice; once is enough
    with open(outname, 'w') as outfile:
        json.dump(tweets, outfile)
def get_user_interests(statement):
    tag_dict = indicoio.text_tags(statement, top_n=5)
    sorted_tags = sorted(tag_dict.items(), key=lambda tup: -tup[1])
    return sorted_tags
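# Quick call sketch (not from the original source): the statement is invented
# and the scores shown in the comment are illustrative only.
interests = get_user_interests("I spent the weekend hiking and photographing birds.")
for tag, score in interests:
    print(tag, round(score, 3))  # e.g. nature 0.41, photography 0.37, ...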
import indicoio
import operator

indicoio.config.api_key = 'b94312524aff44f47c4cc57b9e56c5e6'

# single example
# print indicoio.keywords("Where do I get food in Lesbos?", version=2)
# returns the words that are deemed most relevant
print indicoio.relevance("Where do I get food in Lesbos?", ["food", "general_food"])
# returns a list of proportions representing how relevant each word is to the string

keyword_dictionary = indicoio.text_tags("Where do I get food in Lesbos?")
top_word = max(keyword_dictionary.iteritems(), key=operator.itemgetter(1))[0]
keyword_dictionary.pop(top_word)
# runner-up tag: with the top tag removed, the same lookup yields the second-best
second_top_word = max(keyword_dictionary.iteritems(), key=operator.itemgetter(1))[0]

# batch example
# print indicoio.keywords([
#     "How do I get water nearby?",
#     "Where did my family go?"
# ], version=2)
# returns the words that are deemed most relevant
# print indicoio.relevance(["How do I get water nearby?", "Where did my family go?"], ["family"])
# returns list of proportions representing how relevant the word is to the string
# print indicoio.text_tags([
#     "The most common form of arrow consists of a shaft with an arrowhead attached to the front end and with fletchings and a nock attached to the other end.",
#     "Yoga in Indian traditions, however, is more than physical exercise, it has a meditative and spiritual core."
# ])
def text_tags_extraction(self, text):
    self.text_tags = text_tags(text)
def en_topics(s):
    tag_dict = indicoio.text_tags(s)
    return sorted(tag_dict.keys(), key=lambda x: tag_dict[x], reverse=True)[:5]
import indicoio

indicoio.config.api_key = "13aaf3fb916e24d4d31bc4bf3e79cd6f"

tag_dict = indicoio.text_tags("Iran Agrees to Nuclear Limits, But Key Issues Are Unresolved")
print(sorted(tag_dict.keys(), key=lambda x: tag_dict[x], reverse=True)[:5])

# find amount of positivity
print(indicoio.sentiment('indico is so easy to use!'))
# 0.07062467665597527
# assumes an open file handle `fd`, an integer `size`, and the helpers
# compareKeywords / compareTags / compareEntities defined elsewhere
string1 = fd.read()
# fd = open("BBC.txt", "r")
# string1 += fd.read()

# normalize curly quotes and apostrophes to ASCII
string1 = string1.replace("\xe2\x80\x9c", "\"")
string1 = string1.replace("\xe2\x80\x9d", "\"")
string1 = string1.replace("\xe2\x80\x99", "\'")

keywordList = []
tagList = []
entityList = []
myList = string1.split("\n", size)
for x in range(0, size):
    keywordList.append(indicoio.keywords(myList[x], top_n=10, independent=True))
    tagList.append(indicoio.text_tags(myList[x], threshold=.05))
    entityList.append(indicoio.named_entities(myList[x]))

# build 2-d array of pairwise similarity weights
matrix = [[0 for x in range(size)] for x in range(size)]
for x in range(0, size):
    for y in range(0, size):
        matrix[x][y] = 1000 * compareKeywords(keywordList[x], keywordList[y]) \
                            * compareTags(tagList[x], tagList[y]) \
                            * compareEntities(entityList[x], entityList[y])
import indicoio

# single example: Russian-language sample text (on nutrition and health),
# analyzed with the language="russian" option
text = '''
С питанием связаны все жизненно важные функции организма. Оно — источник
развития тканей и клеток, их постоянного обновления, насыщения человека
энергией. Неправильное питание ― как избыточное, так и недостаточное ―
способно нанести существенный вред здоровью человека в любом возрасте.
Это выражается в снижении уровня физического и умственного развития,
быстрой утомляемости, неспособности оказывать сопротивление воздействию
неблагоприятных факторов окружающей среды, снижении работоспособности и
даже преждевременном старении и сокращении продолжительности жизни.
Кстати, каждой женщине, которая следит за собой, известно, что состояние
кожи напрямую зависит от состояния пищеварительной системы, в частности
кишечника, а значит, и от здорового питания.
'''

sen = indicoio.sentiment(text, language="russian")
a = indicoio.text_tags(text, language="russian")

items = sorted(a.items(), key=lambda e: e[1], reverse=True)
print(f'sentiment = {sen}')
for item in items:
    print(f'{item[0]:30} {item[1]:1.3f}')
def indicoTags(tweet):
    tag_dict = indicoio.text_tags(tweet)
    return sorted(tag_dict.keys(), key=lambda x: tag_dict[x], reverse=True)[:3]
def execute(USERNAME, target, refresh):
    r_data = io_helper.read_raw(USERNAME, target)
    og = sys.stdout
    fpath = io_helper.out_path(USERNAME, target)

    def analysis(raw='', limit=5, text='', percent=True):
        global meta_dict
        # print lines if input is a list of non-dicts;
        # if input is a list of dicts, merge dicts and resend to analysis
        if isinstance(raw, list):
            for item in raw:
                if not isinstance(item, dict):
                    print(item)
                else:
                    create_meta_dict(item)
            analysis(meta_dict, limit, text, percent)
        # if input is dict: print k, v pairs
        # optional args for return limit and description text
        if isinstance(raw, dict):
            print(text)
            ct = 0
            for v in sorted(raw, key=raw.get, reverse=True):
                ct += 1
                if ct > limit:
                    break
                if isinstance(raw[v], float):
                    per = r'%' if percent else ''
                    print(" " + v, str(round(raw[v] * 100, 2)) + per)
                else:
                    print(v, raw[v])
            print()

    def create_meta_dict(item):
        # merge list of dicts into master dict
        global meta_dict
        meta_dict[item['text']] = item['confidence']
        return meta_dict

    # strip escape characters and stray asterisks from the raw text
    rClean = ''
    for i in range(len(r_data)):
        if r_data[i - 1] == '\\':
            rClean = rClean[:-1]
            if r_data[i] != "'":
                continue
        if r_data[i] == '*':
            rClean += ' '
        else:
            rClean += r_data[i]
    r_data = rClean
    del rClean

    indicoio.config.api_key = keycheck.get_key()

    # Big 5
    big5 = {'text': "Big 5 personality inventory matches: ",
            'payload': indicoio.personality(r_data)}

    # Myers-Briggs
    mbtiLabels = indicoio.personas(r_data)
    mbti_dict = {
        'architect': 'intj', 'logician': 'intp', 'commander': 'entj',
        'debater': 'entp', 'advocate': 'infj', 'mediator': 'infp',
        'protagonist': 'enfj', 'campaigner': 'enfp', 'logistician': 'istj',
        'defender': 'isfj', 'executive': 'estj', 'consul': 'esfj',
        'virtuoso': 'istp', 'adventurer': 'isfp', 'entrepreneur': 'estp',
        'entertainer': 'esfp'
    }

    def replace_mbti():
        # map persona labels to their MBTI type codes
        for k, v in mbtiLabels.items():
            k = k.replace(k, mbti_dict[k])
            yield k

    k = list(replace_mbti())
    v = list(mbtiLabels.values())
    payload = dict(zip(k, v))
    mbti = {'text': "Most likely personality styles: ",
            'payload': payload, 'ct': 5, 'percent': True}

    # Political
    pol = {'text': "Political alignments: ",
           'payload': indicoio.political(r_data, version=1)}

    # Sentiment
    sen = {'text': "Sentiment: ",
           'payload': {'Percent positive': indicoio.sentiment(r_data)}, 'ct': 3}

    # Emotion
    emo = {'text': "Predominant emotions:",
           'payload': indicoio.emotion(r_data), 'ct': 5}

    # Keywords
    kw = {'text': "Keywords: ", 'payload': indicoio.keywords(r_data), 'ct': 5}

    # Text tags
    tt = {'text': "Text tags: ", 'payload': indicoio.text_tags(r_data), 'ct': 10}

    # Place
    pla = {'text': "Key locations: ",
           'payload': indicoio.places(r_data, version=2), 'ct': 3, 'percent': True}

    def Karma(USERNAME):
        import praw
        kList = []
        user_agent = "N2ITN"
        r = praw.Reddit(user_agent=user_agent)
        thing_limit = 100
        user = r.get_redditor(USERNAME)
        gen = user.get_submitted(limit=thing_limit)
        karma_by_subreddit = {}
        for thing in gen:
            subreddit = thing.subreddit.display_name
            karma_by_subreddit[subreddit] = (karma_by_subreddit.get(subreddit, 0)
                                             + thing.score)
        for w in sorted(karma_by_subreddit, key=karma_by_subreddit.get, reverse=True):
            kList.append(str(w) + ': ' + str(karma_by_subreddit[w]))
        kList.insert(0, 'Karma by Sub')
        print("\n\t".join(kList[:10]))

    def show(results):
        # accepts a bag of dicts, or a single dict
        if not isinstance(results, dict):
            for X in results:
                show(X)
        else:
            if results == pla and pla['payload'] == []:
                print("Not enough information to infer place of origin")
                print()
            else:
                i = results
                analysis(raw=i.get('payload', ''), limit=i.get('ct', 5),
                         text=i.get('text', ''), percent=i.get('percent', True))

    with open(fpath, 'w') as outtie:
        sys.stdout = outtie
        print(target + USERNAME)
        print()
        show([kw, pla, big5, emo, sen, pol, mbti, tt])
        Karma(USERNAME)
        sys.stdout = og
    return
def get_topics(text):
    if not text:
        return []
    tag_dictionary = indicoio.text_tags(text)
    return [key for key in sorted(tag_dictionary.keys(),
                                  key=lambda x: tag_dictionary[x],
                                  reverse=True)]
def test_batch_texttags(self):
    test_data = ["On Monday, president Barack Obama will be..."]
    response = text_tags(test_data, api_key=self.api_key)
    self.assertTrue(isinstance(response, list))
def identify_keywords(text):
    x = indicoio.text_tags(text, threshold=0.01, top_n=NUM_RESULTS)
    y = indicoio.keywords(text, threshold=0.01, top_n=NUM_RESULTS)
    x.update(y)  # merge keywords into the tag dict
    return x
def get_texttags():
    if request.method == 'POST':
        data = dict(request.form)['data_to_analyze']
        # `sort` is a project helper; text_tags returns a list for batch input,
        # so [0] takes the first document's tag dict
        return json.dumps({
            'text_tags': sort(indicoio.text_tags(data)[0])[0:3]
        })
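# Client-side sketch for the handler above (not from the original source):
# assumes the view is mounted on a Flask app at a hypothetical /texttags route
# running on localhost:5000; adjust the URL to the real deployment.
import requests

resp = requests.post(
    "http://localhost:5000/texttags",
    data={"data_to_analyze": "The senate passed the budget bill after a long debate."},
)
print(resp.json())  # {'text_tags': [...top three entries after sort(...)...]}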