def checkText(app):
    """Scan the decompiled layout XML of *app* for ransomware-style threat text.

    Trains a Naive Bayes classifier on a small labelled corpus, extracts every
    text="..." attribute from the files in F:\\Apktool\\<app>\\res\\layout\\,
    classifies each extracted string, and reports whether any is labelled
    'threat'.

    Parameters
    ----------
    app : str
        Name of the Apktool output directory to scan.

    Returns
    -------
    int
        1 if any extracted string classifies as 'threat', else 0.
    """
    # Labelled training corpus (labels: moneypak / threat / non-threat /
    # law / copyright / p**n). NOTE(review): one label is ' moneypak' with a
    # leading space in the original data — kept as-is to preserve behavior.
    train = [
        ('Find a retail location near you.', 'moneypak'),
        ('Look for a MoneyPak in the prepaid section.', 'moneypak'),
        ('Take it to the cashier and load it with cash.', ' moneypak'),
        ('To pay fine you should enter the digits MoneyPak resulting pass in the payment form and press Pay MoneyPak.', 'moneypak'),
        ('Amount of fine is 500 dollars.', 'moneypak'),
        ('After submitting the MoneyPak voucher your fine will be redeemed in hours.', 'moneypak'),
        ('You can settle the fine with MoneyPak vouchers.', 'moneypak'),
        ('As soon as the money arrives to the Treasury account your Mobile Device will be unlocked and all information will be decrypted in course of hours.', 'moneypak'),
        ('Fine should be paid not later than in 12 hours since current time.', 'moneypak'),
        ('To unlock your device and to avoid other legal consequences you are obligated to pay a release fee of 300 dollars', 'moneypak'),
        ('Payable through GreenDot MoneyPak.', 'moneypak'),
        ('load it with 300 dollars and enter the code.', 'moneypak'),
        ('The penalty as a base measure of punishment on you which you are obliged to pay in a current of is imposed.', 'moneypak'),
        ('Click to win lottery worth ten millions.', 'moneypak'),
        ('This device lock is aimed to stop your illegal activity.', 'threat'),
        ('Your Device has been blocked up for safety reasons listed below.', 'threat'),
        ('All the actions performed on this device are fixed.', 'threat'),
        ('You have been blocked to use your device. you need to pay a fee of rupees 500 to unblock.', 'threat'),
        ('In less than 3 days your device data will be sent to our servers and you will be blocked to use your data. To abort this you should pay a ransom of 500 rupees.', 'threat'),
        ('Please mind that both your personal identities and location are well identified and criminal case can be opened against you in course of hours as of commission of crimes per above Articles.', 'threat'),
        ('The penalty set must be paid in course of hours as of the breach.', 'threat'),
        ('On expiration of the term hours that follow will be used for automatic collection of data on yourself and your misconduct and criminal case will be opened against you.', 'threat'),
        ('This device is locked due to the violation of the federal laws of the United States of America', 'threat'),
        ('The messages with terroristic motives attempts in respect of political persons and also in respect of top public officials were sent from your device', 'threat'),
        ('According to these data you will be permitted to pay fine in State Treasury in the consequence of initiative directed on protection of cyber space in U.S.A and in doing so to seize your clerical correspondence and taking your criminal case to court for decision formulation.', 'threat'),
        ('In case of penalty non-redemption your case will be taken to court for the further decision formulation and determination of your criminal case.', 'threat'),
        ('Seize clerical correspondence taking criminal case court decision formulation', 'threat'),
        ('Penalty non redemption case taken court decision formulation determination criminal', 'threat'),
        ('For this reason your device has been locked.', 'threat'),
        ("Information on your location and snaphots containing your face have been uploaded on the fbi cyber crime department's datacenter.", 'threat'),
        ('According to these positions your actions bear criminal character and you are a criminal subject.', 'threat'),
        ("If you don't adhere to the instructions provided you can be jailed under cyber crime law.", 'threat'),
        ("Send your phone details if you want to unlock your phone.", 'threat'),
        ('install', 'non-threat'),
        ('@string', 'non-threat'),
        ('The government policies have been changed', 'non-threat'),
        ('Under supervision of FBI.U.S.A. Ministry of Interior Interpol Copyright Alliance International Cyber Security Protection Alliance.', 'non-threat'),
        ('You are accused of committing the crime envisaged by Article 1 of United States of America criminal law.', 'non-threat'),
        ('Article 1 of United States of America criminal non-threat provides for the punishment of deprivation of liberty for terms from 5 to years.', 'non-threat'),
        ('Article Section Cause', 'non-threat'),
        ('The policies of government has been changed', 'non-threat'),
        ('you have been restricted by government agencies to continue', 'non-threat'),
        ('Article 1 Section 8 Cause 8 of the Criminal Code provides for a fine of two to five hundred minimal wages or a deprivation of liberty for two to eight years.', 'non-threat'),
        ('Thus violating article 2 of the Criminal Code of United States of America Article 2 of the Criminal Code provides for a deprivation of liberty for four to twelve years.', 'non-threat'),
        ('Spam distribution on e-mail addresses of private persons containing the advertisement of illegal drugs addictive substances or psychopharmaceuticals on the territory of the USA', 'law'),
        ('Disconnection or disposal of the device or your attempts to unlock the device independently will be apprehended as unapproved actions interfering the execution of the law of the united states of america (read section 1509 - obstruction of court orders and section 1510 - obstruction of criminal investigations.', 'non-threat'),
        ('In this case and in case of penalty non-payment in a current of three calendar days from the date of this notification. The total amount of penalty will be tripled and the respective fines will be charged to the outstanding penalty.', 'non-threat'),
        ('In case of dissent with the indicted prosecution you have the right to challenge it in court.', 'non-threat'),
        ('Distribution of false information about politicians.', 'non-threat'),
        ('You have been subjected to violation of Copyright and Related Rights Law (Video Music Software) and illegally using or distributing copyrighted contents', 'copyright'),
        ('Distribution of episodes containing abuse or p**n material with participation of animals', 'p**n'),
        ('Distribution and keeping of p**n material with participation of children', 'p**n'),
        ('You are accused of viewing and storage of forbidden child pornography and zoophilia.', 'p**n'),
        ('You have violated World Declaration on non-proliferation of child pornography.', 'p**n'),
        ('You have been viewing or distributing prohibited Pornographic content (Child Pornography photos and etc were found on your device).', 'p**n'),
        ('Video materials of p**n content including sexual acts with children (persons up to 9 years old).', 'p**n'),
        ('Video materials containing the scene of child (persons up to 9 years) and animals abuse.', 'p**n'),
        ('Video materials are made with the help of your device contain the scenes of cruel child and animals abuse.', 'p**n'),
        ('As a result of full scanning of your device some suspicious files have been found.', 'p**n'),
        ('Your attendance of the forbidden pornographic sites has been fixed.', 'p**n'),
        ('Depiction of animal cruelty.', 'p**n'),
        ('Whoever knowingly creates sells or possesses a depiction of animal cruelty with the intention of placing that depiction in interstate or foreign commerce for commercial gain shall be fined under this title or imprisoned not more than 5 years or both.', 'p**n'),
        ('Certain activities relating to material constituting or containing child pornography.', 'p**n'),
    ]
    classifier = NaiveBayesClassifier(train)
    path = "F:\\Apktool\\%s\\res\\layout\\" % app
    os.chdir(path)
    # Collect the concatenated text="..." attribute values of each layout file.
    # (Renamed from `list`, which shadowed the builtin.)
    extracted = []
    for fname in os.listdir(path):
        # BUG FIX: the original opened every file but closed only the last
        # one after the loops finished; `with` closes each file promptly.
        with open(fname, "r") as fh:
            content = fh.read()
        joined = "".join(re.findall(r'text=\"(.*?)\"', content, re.DOTALL)).replace('\n', ' ')
        if joined != '':
            extracted.append(joined)
    # Classify each extracted string and count 'threat' labels.
    threat_count = 0
    for text in extracted:
        print("Text: " + text)
        blob = TextBlob(text, classifier=classifier)
        if blob.classify() == "threat":
            threat_count += 1
    if threat_count >= 1:
        print("THREATENING TEXT PRESENT")
        return 1
    print("Threatening Text Not Present")
    return 0
def textfeatures(transcript):
    """Extract a 63-dimensional numeric feature vector from *transcript*.

    Feature order: counts of 'a'-'z' (26), space count, digit/number-word
    count, capital-letter count, 31 part-of-speech tag counts, sentiment
    polarity, sentiment subjectivity, and the number of repeated words.

    Parameters
    ----------
    transcript : str
        Raw text to featurize.

    Returns
    -------
    numpy.ndarray
        1-D array of length 63.
    """
    # --- character-level features ---
    letter_counts = [transcript.count(ch) for ch in 'abcdefghijklmnopqrstuvwxyz']
    space = transcript.count(' ')

    # numerical features: digits plus spelled-out number words
    digits = sum(transcript.count(d) for d in '0123456789')
    number_words = sum(transcript.count(w) for w in (
        'zero', 'one', 'two', 'three', 'four', 'five',
        'six', 'seven', 'eight', 'nine', 'ten'))
    number = digits + number_words
    capletter = sum(1 for ch in transcript if ch.isupper())

    # --- part-of-speech features ---
    tokens = word_tokenize(transcript)
    # BUG FIX: the original called nltk.pos_tag on the raw string, which tags
    # each individual CHARACTER; tag the tokenized words instead.
    tagged = nltk.pos_tag(tokens)
    # NOTE(review): 'RB' was counted but omitted from the original feature
    # vector; it stays omitted here to preserve the output length for callers.
    pos_tags = ('CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
                'NN', 'NNP', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RBR', 'RBS',
                'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
                'WDT', 'WP', 'WRB')
    tag_counts = dict.fromkeys(pos_tags, 0)
    for _, tag in tagged:
        if tag in tag_counts:
            tag_counts[tag] += 1

    # --- sentiment ---
    tblob = TextBlob(transcript)
    polarity = float(tblob.sentiment[0])
    subjectivity = float(tblob.sentiment[1])

    # --- word repeats ---
    # BUG FIX: the original reused the loop index `i` in its loops below,
    # clobbering the count of the letter 'i' computed above; distinct loop
    # variables keep the letter counts intact.
    words = transcript.split()
    remaining = transcript.split()
    repeat = 0
    for w in words:
        remaining.remove(w)
        if w in remaining:
            repeat += 1

    return np.array(letter_counts
                    + [space, number, capletter]
                    + [tag_counts[t] for t in pos_tags]
                    + [polarity, subjectivity, repeat])
import tweepy from textblob import TextBlob consumer_key = "tEdvNAY2zbwvRpunL8b6NZ9Fi" consumer_secret = "e0w6WTwjzVBhlbW7DJ8Y6mWXHMsTQIE9WAsq8cdjdgpxyBGfwo" access_token = "1394326038-Ss1gnEPaMGMN2AVuvtz81oVT9hjn0QxosEjVQbE" access_secret = "Qey67q8OaKaStml9FjazGTbz8jQZFlISkhov9sMgTdhV9" auth = tweepy.OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_token, access_secret) api = tweepy.API(auth) sentiment_analysis_txt = open('sentiment.txt', 'w') public_tweets = api.search('Berger to Ajah') text = "" for tweet in public_tweets: print(tweet.text) analysis = TextBlob(tweet.text) print(analysis.sentiment) sentiment_analysis_txt.write(text) sentiment_analysis_txt.close()
from textblob import TextBlob import json companies = [] with open('data.json') as jsonfile: companies = json.load(jsonfile) for company in companies: blob = TextBlob(company['purpose']) company['sentiment'] = blob.sentiment.polarity def bySentiment(c): return c['sentiment'] companies.sort(key=bySentiment) topTen = companies[-10:] bottomTen = companies[:10] print("TOP TEN") for c in topTen: print(c['name'], " -- ", c['sentiment']) print("BOTTOM TEN") for c in bottomTen: print(c['name'], " -- ", c['sentiment'])
def textblob_tokenizer(str_input): blob = TextBlob(str_input.lower()) tokens = blob.words words = [token.stem() for token in tokens if token not in updatedStopWords] return words
def spell_correction(df, desc_colname, shortdesc_colname): df['SpellCorrected_Desc'] = df[desc_colname].apply( lambda x: str(TextBlob(x).correct())) df['SpellCorrected_Short_Desc'] = df[shortdesc_colname].apply( lambda x: str(TextBlob(x).correct())) return df
from textblob.classifiers import NaiveBayesClassifier from textblob import TextBlob from Training import train from Testing import test from app import answer import sys cl = NaiveBayesClassifier(train) # Classify some text #print(cl.classify(name)) #Classify a TextBlob #blob = TextBlob("They look blessed.", classifier=cl) blob = TextBlob(answer, classifier=cl) #print(blob) #print(blob.classify()) for sentence in blob.sentences: #print(sentence) #print(sentence.classify()) feedback = sentence.classify() # Compute accuracy #print("Accuracy: {0}".format(cl.accuracy(test))) accuracy = "Accuracy: {0}".format(cl.accuracy(test)) # Show 5 most informative features features = cl.show_informative_features(5)
def update(num, line, countsCollege, sentimentsums):
    """Update per-category tweet counts, sentiment sums, and min/max extremes.

    Four pairs of categories (0/1, 2/3, 4/5, 6/7) are updated: for each pair,
    the college index *num* selects which member of the pair receives this
    tweet's sentiment polarity.

    Parameters
    ----------
    num : int
        College index (0-17) identifying which college the tweet mentions.
    line : str
        Raw tweet text.
    countsCollege : list
        Per-category tweet counts (mutated in place).
    sentimentsums : list
        Per-category polarity sums (mutated in place).

    Relies on module-level ``mini`` and ``maxi`` lists for per-category
    extremes (mutated in place).
    """
    # BUG FIX (performance): the original recomputed
    # TextBlob(line).sentiment.polarity up to five times per branch;
    # compute it once.
    pol = TextBlob(line).sentiment.polarity

    def _bump(idx):
        # Accumulate count/sum and track polarity extremes for category idx.
        countsCollege[idx] += 1
        sentimentsums[idx] += pol
        if pol < mini[idx]:
            mini[idx] = pol
        if pol > maxi[idx]:
            maxi[idx] = pol

    # Each pair mirrors the original if/else membership tests exactly.
    _bump(0 if num in (0, 1, 2, 4, 6, 7, 12, 15) else 1)
    _bump(2 if num in (0, 1, 2, 3, 4, 5, 6, 12, 14, 15) else 3)
    _bump(4 if num in (2, 5, 6, 13, 15) else 5)
    _bump(6 if num in (1, 4, 6, 2, 14) else 7)
import tweepy from textblob import TextBlob wiki = TextBlob("Vivek is always angry beacuse he can't manage his time") # print(wiki.tags) #Parts of speech # print(wiki.words) #Tokenize print(wiki.sentiment) consumer_key = 'o5CbrDAJkpCLBhHTsu3YkSsvN' consumer_secret = '2irncRv189vQTBMF3qAO5vwO4LpEHT29rH8r3nagzzvNt9IEEQ' access_token = '2996486912-b7NCHNfnISl5fsXVO0OLH4Dl7NyfnXCtxwTgsUh' access_token_secret = ' 9KJksG6vLknQs80MimZvHVoiAuYkeGaXrtUxL8Sulxkeg' auth = tweepy.OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_token, access_token_secret) api = tweepy.API(auth) public_tweets = api.search('Trump') for tweet in public_tweets: print(tweet.text) analysis = TextBlob(tweet.text) print(analysis.sentiment) print("")
def getPolarity(text): return TextBlob(text).sentiment.polarity
num = 11 elif (wordFinder("@oursoutheastern", line)): num = 12 elif (wordFinder("@Grambling1901", line)): num = 13 elif (wordFinder("@SouthernU_BR", line)): num = 14 elif (wordFinder("@nsula", line)): num = 15 elif (wordFinder("@LA_College", line)): num = 16 elif (wordFinder("@NichollsState", line)): num = 17 tweets_per_college[num] += 1 college_sentiment_sum[num] += TextBlob(line).sentiment.polarity update(num, line, countsCollege, sentimentsums) for t in range(0, 8): sentimentsums[t] = sentimentsums[t] / countsCollege[t] for t in range(0, 18): college_sentiment_sum[t] = college_sentiment_sum[t] / tweets_per_college[t] #prints sentiment averages for different factors: popn, rank, region, followers on twitter '''for s,c,l,h in zip (sentimentsums, countsCollege, mini, maxi): print ('%.3f %d' + str(l).rjust(5) + str(h).rjust(5)) % (s, c)''' print 'College'.rjust(25) + 'No. of tweets'.rjust( 15) + 'Average sentiment score'.rjust(30) for n, t, s in zip(college, tweets_per_college, college_sentiment_sum):
def getSubjectivity(text): return TextBlob(text).sentiment.subjectivity
j += 1 k = 0 with open('output.csv', 'wb') as c: writer = csv.writer(c) writer.writerow(['Word', 'Count', 'Sentence', 'Splice', 'Polarity', 'Sentence Pol', 'Subjectivity', 'Avg Polarity', 'Avg Whole Pol', 'Avg Subjectivity','Location']) while k < SIZE: polarSum = 0 subjectSum = 0 wholeSum = 0 for spot in topWords[k].getSentenceArray(): splice = getSplice(spot) whole = sentences[spot[0]] wholePol = TextBlob(whole.decode('utf-8')).polarity polarity = TextBlob(splice.decode('utf-8')).polarity subjectivity = TextBlob(splice.decode('utf-8')).subjectivity polarSum += polarity subjectSum += subjectivity wholeSum += wholePol writer.writerow([str(topWords[k].getWord()), str(topWords[k].getCount()), sentences[spot[0]], str(splice), str(polarity), str(wholePol), str(subjectivity)]) topWords[k].setAvgPol(polarSum/topWords[k].getCount()) topWords[k].setAvgSub(subjectSum/topWords[k].getCount()) writer.writerow([" ", " ", " ", " ", " ", " ", " ", str(polarSum/topWords[k].getCount()), str(wholeSum/topWords[k].getCount()), str(subjectSum/topWords[k].getCount()), str(topWords[k].getSentenceArray())]) k += 1 c.close()
def main(): # input_filepath = "/Users/shenjiaming/Desktop/local-embedding/SegPhrase/small/linked_results.wiki.txt" # output_filepath = "/Users/shenjiaming/Desktop/local-embedding/SegPhrase/small/linked_results.wiki.pos.tsv" input_filepath = "linked_results.wiki.txt" output_filepath = "linked_results.wiki.pos.tsv" start = time.time() np_phrase_cnt = 0 phrase_only = True with open(input_filepath, "r") as fin, open(output_filepath, "w") as fout: cnt = 0 fout.write("\t".join([ "Phrase", "Combined Score", "Phrase Quality Score", "Wiki Linking Score", "NP Count Score", "\n" ])) for line in fin: cnt += 1 if cnt % 1000 == 0: print(cnt) line = line.strip() segs = line.split("\t") phrase = segs[0] phrase_quality_score = float(segs[-1]) try: wiki_score = int(segs[1]) np_cnt_score = len(TextBlob(phrase).noun_phrases) except (ValueError, UnicodeDecodeError) as e: # import ipdb; ipdb.set_trace(); continue combined_score = math.sqrt(phrase_quality_score * (wiki_score + 1) * (np_cnt_score + 1)) fout.write("\t".join([ "_".join(phrase.split()), str(combined_score), str(phrase_quality_score), str(wiki_score), str(np_cnt_score), "\n" ])) # # # if score > 0 and phrase_quality_score >= 0.5: # if phrase_only: # fout.write("_".join(phrase.split()) + "\n") # else: # fout.write("_".join(phrase.split()) + "\t" + str(score) + "\t" + str(phrase_quality_score) + "\n") # # # if score != 0: # fout.write(line+"\n") # else: # deal with noun_phrase # tmp = TextBlob(phrase) # if len(tmp.noun_phrases) == 0: # fout.write(line+"\n") # still zero # else: # np_phrase_cnt += 1 # nps = str("|".join([ele for ele in tmp.noun_phrases])) # fout.write(phrase+"\t"+"0.5"+"\t"+nps+"\t"+segs[-1]+"\n") end = time.time() print("Number of additional noun phrases: %s" % np_phrase_cnt) print("Finish using POS Tagger for NP extraction using %s seconds" % (end - start))
def getSentiment(self): text = self.title #print("text: ", text.encode("utf-8")) #text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)", " ", text).split()) #print(text) self.analysis = TextBlob(text)
def getsent(st): if isinstance(st, str): t = TextBlob(st) return t.sentiment.polarity else: return 0
def tokenize(texts): results = [unicode(text, 'utf-8').lower() for text in texts] tests = [TextBlob(word) for word in results] return [[word.lemmatize() for word in test.words if word not in STOPWORDS] for test in tests]
n = 6000 train_n = 5000 test_n = 1000 allwords = re.findall('\w+', open(sys.argv[1]).read()) word_list = Counter(allwords).most_common(n) m = open(sys.argv[6], "r") tags = {} for line in m: pair = line.split('\t') tags[pair[0]] = pair[1].rstrip() m.close() f1 = open(sys.argv[2], "w") f2 = open(sys.argv[3], "w") source = sys.argv[4] target = sys.argv[5] count = 0 for word in word_list: word_map = TextBlob(word[0]).translate(from_lang=source, to=target) #tag = tags[TextBlob(word[0]).tags[0][1]] word_pair = (word[0].rstrip() + " " + word_map.string + "\n") count = count + 1 if count <= train_n: f1.write(word_pair.encode('utf8')) else: f2.write(word_pair.encode('utf8')) f1.close() f2.close()
import csv from textblob import TextBlob csvData = [] with open('testData.csv', 'r') as file: reader = csv.reader(file) for row in reader: csvData.append(row) with open('testData_result.csv', 'w') as file: for data in csvData: analysisPol = TextBlob(str(data)).polarity analysisSub = TextBlob(str(data)).subjectivity writer = csv.writer(file, delimiter=',') writer.writerow([analysisPol, data])
welcome = sys.argv[1] else: welcome = "How are you, Coco?" runTime = 60 ;# seconds startTime = time.time() while 1: if initialGreetings == 0: cocoBot(welcome) else: cocoBot(response) response = raw_input() chechLanguage(response) responseMsg = TextBlob(response) if initialGreetings != 1: cocoAssignsAvatar() initialGreetings = 1 print("After cocoAssignsAvatar()") itsTimeForBye = 0 for word in responseMsg.words: if word.lower() in USER_INIT_BYE: itsTimeForBye = 1 elapsed = time.time() - startTime if elapsed >= runTime : cocoWantsABreak("cocoInitBye") response = raw_input(str(user_avatar) + " >> ")
import os
import re
import logging
import time
from operator import add
from textblob import TextBlob  # TextBlob linguistic toolkit
from nltk.corpus import stopwords
from P2N_Lib import LoadBiblioFile, GenereListeFichiers
from P2N_Config import LoadConfig

# Load the P2N project configuration (query string and project name).
configFile = LoadConfig()
import codecs
requete = configFile.requete
projectName = configFile.ndf
# Sample patent abstract used to exercise the tokenizer / Word2Vec below.
phrase = "invention relates to food industry, namely to production of granular caviar from hydrobionts, which has a high biological activity. Method for obtaining edible granular caviar from Artemia's gonad cancer includes cleaning the cysts by decapsulating them, drying the purified kernels of Artemia caviar to a residual moisture content of not more than 5-10 % by weight, at which the layers of the product are formed in polymer bags and processed by a stream of accelerated electrons obtained in a pulsed linear electron accelerator with an accelerated electron energy of 2.5-5 MeV and an absorbed radiation dose of not more than 20 kGy. Prior to formation of food product layers in polymer bags for irradiation with accelerated electrons, organoleptic and/or preservative additives are additionally added thereto at the following quantitative content of the components, % by weight: organoleptic and/or preservative additives 3.0-30.0; decapsulated cysts of Artemia crustaceans - the rest is up to 100 %.EFFECT: proposed method of obtaining food caviar provides for the expansion of the raw material base for the production of granular caviar, as well as production of granular caviar with new higher nutritional, biologically active and organoleptic properties.1 cl, 1 tbl, 10 ex"
phraseBlob = TextBlob(phrase)
# NOTE(review): .words yields individual tokens, so Word2Vec below is trained
# on words (characters as "sentences"), not on sentence lists — confirm intent.
sentences = phraseBlob.words
# NOTE(review): Word2Vec is not imported in this chunk — presumably
# `from gensim.models import Word2Vec` appears elsewhere in the file; verify.
model = Word2Vec(sentences, min_count=1)
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.vocab)
print(words)
# access vector for one word
# Resolve the project's result paths from the configuration.
ndf = projectName
BiblioPath = configFile.ResultBiblioPath
ResultBiblioPath = configFile.ResultBiblioPath
temporPath = configFile.temporPath
ResultPathContent = configFile.ResultContentsPath
ResultAbstractPath = configFile.ResultAbstractPath
# Parse the Status objects dates = [] polarities = [] for s in statuses: # Uncomment below to print the contents of the tweets status_text = s.text status_time = s.created_at # print '\n' + status_time # print s.text fav_count = s.favorite_count retweet_count = s.retweet_count # print "Favorite Count: " + str(fav_count) # print "Retweet Count" + retweet_count # Run sentiment analysis using TextBlob tb = TextBlob(status_text) status_polarity = tb.sentiment.polarity polarities.append(status_polarity) # Parse and format the date/time of the tweet split_time = status_time.split(" ") dt = datetime.datetime(int(split_time[5]), monthmap[split_time[1]], int(split_time[2]), 0, 0) dates.append(dt) # Create numpy arrays for dates and polarities of the tweets date_array = np.array([dt for dt in dates]) polarities_array = np.array(polarities) # Aggregate tweets that are on the same date and take average polarity
# -*- coding: utf-8 -*- """ Código fuente de ejemplos y ejercicios del libro "Curso de Programación Python" (C) Ediciones Anaya Multimedia 2019 Autores: Arturo Montejo Ráez y Salud María Jiménez Zafra """ from textblob import TextBlob text = ''' The titular threat of The Blob has always struck me as the ultimate movie monster: an insatiably hungry, amoeba-like mass able to penetratevirtually any safeguard, capable of--as a doomed doctor chillingly describes it--"assimilating flesh on contact. Snide comparisons to gelatin be damned, it's a concept with the most devastating of potential consequences, not unlike the grey goo scenario proposed by technological theorists fearful of artificial intelligence run rampant. ''' blob = TextBlob(text) print("Etiquetas: ", blob.tags) print("Sintagmas nominales:", blob.noun_phrases) print("Polaridad:") for sentence in blob.sentences: print(sentence, " = ", sentence.sentiment.polarity)
def __call__(self, text): return set(TextBlob(text).words.lemmatize().lower()).intersection(self.words)
def analyse_sentiment_get(): sentence = request.args.get('sentence') polarity = TextBlob(sentence).sentences[0].polarity return str(polarity)
twt = pd.read_csv('twitter training data.csv', encoding = 'latin-1') twt.head() twt = twt.iloc[:1000] #nltk.download() # Sentiment analysis using Text Blob # Creating empty dataframe to store results FinalResults = pd.DataFrame() # Run Engine for i in range(0, twt.shape[0]): blob = TextBlob(twt.iloc[i,5]) temp = pd.DataFrame({'Tweets': twt.iloc[i,5], 'Polarity': blob.sentiment.polarity}, index = [0]) FinalResults = FinalResults.append(temp) FinalResults['Sentiment'] = FinalResults['Polarity'].apply(lambda x: 'Positive' if x>0 else 'Negative' if x<0 else 'Neutral') FinalResults['Sentiment'].describe() #Results: Most of the tweets are Neutral # Sentiment Analysis using Vader FinalResults_Vader = pd.DataFrame()
def on_data(self, data):
    """Tweepy stream callback: filter finance-related tweets and index them
    into Elasticsearch with a TextBlob sentiment label.

    NOTE(review): Python 2 code (print statements). `es`, `re`, `json`,
    `datetime` and `TextBlob` are presumably bound at module level — confirm.
    Returns True so the stream keeps running.
    """
    # decode json
    dict_data = json.loads(data)
    text=dict_data["text"]
    print text
    # Capture any URLs before the text is lower-cased / scrubbed below.
    link=re.findall(r'http[^ ]*',text)
    print link
    text=text.lower()
    # Index the tweet into a scratch 'temp' index (always id=1, so each tweet
    # overwrites the last) so the keyword query below can run against it.
    es.index(index='temp', doc_type='temp', id=1, refresh=True, body={
        "message": text
    })
    # Does the tweet contain at least one finance/loan keyword?
    result=es.search_exists(
        index='temp', doc_type='temp',
        body={
            'query':{'bool':{'should':[{'terms':{
                'message':['loans','grant','grants','loan','pay','budget','debt','money',
                           'save','spend','invest','tax','taxes','dollar','apply','application','paid','credit','interest','bank','debtor','repay','borrow','lend','lender','federal','fund','funding']}
            }],'minimum_should_match':1}}
        })
    if result==True:  #If the incoming tweet matches the query
        print "MATCH!"
        # Scrub retweet markers, entities, URLs, hashtags, @mentions,
        # apostrophes and remaining punctuation from the display copy.
        text=re.sub(r'rt |RT ','',text)
        text=re.sub(r'&','',text)
        text=re.sub(r'http[^ ]*','',text)
        text=re.sub(r'#','',text)
        text=re.sub(r'@[a-zA-Z0-9]*','',text)
        text=re.sub(r'\'','',text)
        text=re.sub(r'[^a-zA-Z0-9 ]',' ',text)
        #text=nltk.word_tokenize(text)
        #text=[word for word in text if word.lower() not in stopwords.words("english")]
        #text=[st.stem(word) for word in text]
        #text=' '.join(text)
        # Sentiment runs on the ORIGINAL tweet text, not the scrubbed copy.
        tweet = TextBlob(dict_data["text"])
        # determine if sentiment is positive, negative, or neutral
        if tweet.sentiment.polarity < 0:
            sentiment = "negative"
        elif tweet.sentiment.polarity == 0:
            sentiment = "neutral"
        else:
            sentiment = "positive"
        print datetime.datetime.now()
        print text
        if 'http' in dict_data["text"]: #if the tweet contains a link
            print 'contains link'
            # Index with the extracted links; link_processed='no' marks it
            # for a later link-resolution pass.
            es.index(index="stream", doc_type="SBA", body={
                "user": dict_data["user"]["screen_name"],
                'date': datetime.datetime.now(),
                "message": text,
                "full message": dict_data["text"],
                # "url": dict_data["urls"]["expanded_url"],
                "polarity": tweet.sentiment.polarity,
                "subjectivity": tweet.sentiment.subjectivity,
                "sentiment": sentiment,
                "link": link,
                "link_processed":'no',
                "article_title": '',
                "article_url": '',
                "article_text": '',
                'flag':''
            })
        else:
            # No link in the tweet: index with placeholder link fields and
            # mark link processing as already done.
            es.index(index="stream", doc_type="SBA", body={
                "user": dict_data["user"]["screen_name"],
                'date': datetime.datetime.now(),
                "message": text,
                "full message": dict_data["text"],
                # "url": dict_data["urls"]["expanded_url"],
                "polarity": tweet.sentiment.polarity,
                "subjectivity": tweet.sentiment.subjectivity,
                "sentiment": sentiment,
                "link": "no link",
                "link_processed":'yes',
                "article_title": '',
                "article_url": '',
                "article_text": '',
                'flag':''
            })
    return True
def spacy_featurize(transcript):
    """Extract a flat feature vector from *transcript* using spaCy + TextBlob.

    Returns ``(features, labels)``: parallel lists where ``features`` holds
    numeric values and ``labels`` their human-readable names. Features are:
    token-level POS / fine tag / dependency / shape frequencies (unknown
    values fall into explicit "*_other" buckets), per-sentence polarity and
    subjectivity statistics, basic counts (characters, words, sentences,
    words per sentence), noun-chunk uniqueness counts plus chunk-root
    dependency frequencies, and named-entity counts.

    Relies on the module-level ``stats`` helper, which is assumed to return
    (mean, std, max, min, median) for a 1-D numpy array -- TODO confirm.
    """
    from collections import Counter

    # spacy.load() is expensive and the model never changes, so cache the
    # pipeline on the function itself instead of reloading per call.
    if not hasattr(spacy_featurize, "_nlp"):
        spacy_featurize._nlp = spacy.load('en_core_web_sm')
    doc = spacy_featurize._nlp(transcript)

    # Known value inventories for each token attribute.
    entity_types = ['PERSON', 'NORP', 'FAC', 'ORG',
                    'GPE', 'LOC', 'PRODUCT', 'EVENT',
                    'WORK_OF_ART', 'LAW', 'LANGUAGE',
                    'DATE', 'TIME', 'PERCENT', 'MONEY',
                    'QUANTITY', 'ORDINAL', 'CARDINAL']
    pos_types = ['PROPN', 'ADP', 'DET', 'NUM', 'PUNCT', 'SPACE', 'VERB',
                 'NOUN', 'ADV', 'CCONJ', 'PRON', 'ADJ', 'SYM', 'PART',
                 'INTJ', 'X']
    tag_types = ['NNP', 'IN', 'DT', 'CD', 'NNPS', ',', '_SP', 'VBZ', 'NN',
                 'RB', 'CC', '', 'NNS', '.', 'PRP', 'MD', 'VB', 'HYPH',
                 'VBD', 'JJ', ':', '-LRB-', '$', '-RRB-', 'VBG', 'VBN',
                 'NFP', 'RBR', 'POS', 'VBP', 'RP', 'JJS', 'PRP$', 'EX',
                 'JJR', 'WP', 'WDT', 'TO', 'WRB', "''", '``', 'PDT', 'AFX',
                 'RBS', 'UH', 'WP$', 'FW', 'XX', 'SYM', 'LS', 'ADD']
    dep_types = ['compound', 'ROOT', 'prep', 'det', 'pobj', 'nummod',
                 'punct', '', 'nsubj', 'advmod', 'cc', 'conj', 'aux',
                 'dobj', 'nmod', 'acl', 'appos', 'npadvmod', 'amod',
                 'agent', 'case', 'intj', 'prt', 'pcomp', 'ccomp', 'attr',
                 'dep', 'acomp', 'poss', 'auxpass', 'expl', 'mark',
                 'nsubjpass', 'quantmod', 'advcl', 'relcl', 'oprd', 'neg',
                 'xcomp', 'csubj', 'predet', 'parataxis', 'dative',
                 'preconj', 'csubjpass', 'meta']
    shape_types = ['\ufeffXxx', 'Xxxxx', 'XXxxx', 'xx', 'X', 'Xxxx', 'Xxx',
                   ',', '\n\n', 'xXxxx', 'xxx', 'xxxx', '\n', '.', ' ', '-',
                   'xxx.xxxx.xxx', '\n\n\n', ':', '\n ', 'dddd', '[', '#',
                   'dd', ']', 'd', 'XXX-d', '*', 'XXXX', 'XX', 'XXX',
                   '\n\n\n\n', 'Xx', '\n\n\n ', '--', '\n\n ', ' ', ' ',
                   ' ', "'x", 'x', 'X.', 'xxx--', ';', 'Xxx.', '(', ')',
                   "'", '“', '”', 'Xx.', '!', "'xx", 'xx!--Xxx', "x'xxxx",
                   '?', '_', "x'x", "x'xx", "Xxx'xxxx", 'Xxxxx--', 'xxxx--',
                   '--xxxx', 'X--', 'xx--', 'xxxx”--xxx', 'xxx--“xxxx',
                   "Xxx'x", ';--', 'xxx--_xxx', "xxx'x", 'xxx!--xxxx',
                   'xxxx?--_Xxx', "Xxxxx'x", 'xxxx--“xxxx', "xxxx'xxx",
                   '--Xxxxx', ',--', '?--', 'xx--“xx', 'xx!--X', '.--',
                   'xxx--“xxx', ':--', 'Xxxxx--“xxxx', 'xxxx!--xxxx',
                   'xx”--xxx', 'xxxx--_xxx', 'xxxx--“xxx', '--xx', '--X',
                   'xxxx!--Xxx', '--xxx', 'xxx_.', 'xxxx--_xx',
                   'xxxx--_xx_xxxx', 'xx!--xxxx', 'xxxx!--xx', "X'xx",
                   "xxxx'x", "X_'x", "xxx'xxx", '--Xxxx', "X'Xxxxx",
                   "Xx'xxxx", '--Xxx', 'xxxx”--xxxx', 'xxxx!--', 'xxxx--“x',
                   'Xxxx!--Xxxx', 'xxx!--Xxx', 'Xxxxx.', 'xxxx_.',
                   'xx--“Xxxx', '\n\n ', 'Xxxxx”--xxx', 'xxxx”--xx',
                   'xxxx--“xx', "Xxxxx!--Xxx'x", "X'xxxx", 'Xxxxx?--',
                   '--Xx', 'xxxx!”--Xx', "xxxx--“X'x", "xxxx'",
                   'xxx.--“Xxxx', 'xxxx--“X', 'xxxx!--X', 'Xxx”--xx',
                   'xxx”--xxx', 'xxx-_xxx', "x'Xxxxx", 'Xxxxx!--X',
                   'Xxxxx!--Xxx', 'dd-d.xxx', 'xxxx://xxx.xxxx.xxx/d/dd/',
                   'xXxxxx', 'xxxx://xxxx.xxx/xxxx', 'd.X.', '/', 'd.X.d',
                   'd.X', '%', 'Xd', 'xxxx://xxx.xxxx.xxx', 'ddd(x)(d',
                   'X.X.', 'ddd', '*****@*****.**', 'xxxx://xxxx.xxx', '$',
                   'd,ddd']
    chunkdep_types = ['ROOT', 'pobj', 'nsubj', 'dobj', 'conj', 'appos',
                      'attr', 'nsubjpass', 'dative', 'pcomp']

    features = []
    labels = []
    poslist = []
    taglist = []
    deplist = []
    shapelist = []

    # Bucket every token; values outside the inventories fall into an
    # explicit "other" bucket so nothing is silently dropped.
    for token in doc:
        poslist.append(token.pos_ if token.pos_ in pos_types
                       else 'pos_other')
        taglist.append(token.tag_ if token.tag_ in tag_types
                       else 'tag_other')
        deplist.append(token.dep_ if token.dep_ in dep_types
                       else 'dep_other')
        shapelist.append(token.shape_ if token.shape_ in shape_types
                         else 'shape_other')
    pos_types.append('pos_other')
    tag_types.append('tag_other')
    dep_types.append('dep_other')
    shape_types.append('shape_other')

    # Frequency of each known value (plus the "other" bucket). Counter gives
    # one O(n) pass instead of the original O(n*m) repeated list.count().
    for type_list, observed in ((pos_types, poslist),
                                (tag_types, taglist),
                                (dep_types, deplist),
                                (shape_types, shapelist)):
        counts = Counter(observed)
        for value in type_list:
            features.append(counts[value])
            labels.append(value)

    # Per-sentence polarity/subjectivity (one TextBlob per sentence; the
    # original built two).
    sentences = [sent.text for sent in doc.sents]
    sent_polarity = []
    sent_subjectivity = []
    for sentence in sentences:
        sentiment = TextBlob(sentence).sentiment
        sent_polarity.append(sentiment[0])
        sent_subjectivity.append(sentiment[1])

    # Statistical summaries per sentence; `stats` is assumed to return the
    # values in (mean, std, max, min, median) order, matching the labels.
    stat_names = ('mean', 'std', 'max', 'min', 'median')
    for value, name in zip(stats(np.array(sent_polarity)), stat_names):
        features.append(value)
        labels.append(name + ' sentence polarity')
    for value, name in zip(stats(np.array(sent_subjectivity)), stat_names):
        features.append(value)
        labels.append(name + ' sentence subjectivity')

    # Basic size counts.
    features.append(len(transcript))
    labels.append('character count')
    words = len(transcript.split())
    features.append(words)
    labels.append('word count')
    sentence_num = len(sentences)
    features.append(sentence_num)
    labels.append('sentence number')
    # BUG FIX: the original computed sentence_num / words (sentences per
    # word) despite labelling it 'words per sentence'; also guard against
    # an empty transcript to avoid ZeroDivisionError.
    wps = float(words) / sentence_num if sentence_num else 0.0
    features.append(wps)
    labels.append('words per sentence')

    # Noun-chunk features: unique chunk texts / roots / heads, plus the
    # frequency of each chunk-root dependency.
    chunktext = []
    chunkroot = []
    chunkdep = []
    chunkhead = []
    for chunk in doc.noun_chunks:
        # BUG FIX: the original tested `chunk.text not in chunk.text`, which
        # is always False (a string contains itself), so unique chunk texts
        # were never collected and this feature was stuck at 0.
        if chunk.text not in chunktext:
            chunktext.append(chunk.text)
        if chunk.root.text not in chunkroot:
            chunkroot.append(chunk.root.text)
        # NOTE(review): the collapsed original is ambiguous about whether
        # this append was guarded by the dedup check above; counted here as
        # a per-chunk frequency, which matches its use as counts below.
        chunkdep.append(chunk.root.dep_)
        if chunk.root.head.text not in chunkhead:
            chunkhead.append(chunk.root.head.text)
    features.append(len(chunktext))
    labels.append('unique chunk noun text')
    features.append(len(chunkroot))
    labels.append('unique chunk root text')
    features.append(len(chunkhead))
    labels.append('unique chunk root head text')
    for dep in chunkdep_types:
        features.append(chunkdep.count(dep))
        labels.append('chunkdep ' + dep)

    # Named-entity frequencies.
    ent_labels = [ent.label_ for ent in doc.ents]
    features.append(len(ent_labels))
    labels.append('number of named entities')
    for entity_type in entity_types:
        features.append(ent_labels.count(entity_type))
        labels.append(entity_type)

    return features, labels
# results = pd.DataFrame(columns=[
#     'Comment_ID', 'Body', 'Controversiality', 'Comment_Date',
#     'Comment_Score', 'Polarity', 'Subjectivity',
#     'Author', 'Author_flair_text', 'Author_LKarma', 'Author_CKarma',
#     'Author_Date', 'Submission_ID', 'Submission_title', 'Submission_Date',
#     'Submission_Title_Polarity', 'Submission_Title_Subjectivity',
#     'Submission_Score', 'Submission_Author', 'Submission_Author_LKarma',
#     'Submission_Author_CKarma', 'Submission_Author_Date', 'Subreddit',
# ])

# Connect to the local Kafka cluster and stream subreddit comments,
# recording those with clearly negative TextBlob polarity into `results`.
kafka = KafkaClient(["localhost:9092", "localhost:9093"])
producer = SimpleProducer(kafka)

# NOTE(review): `i` is never advanced inside the visible loop body, so each
# negative comment overwrites the same row -- confirm against the full file.
i = len(results)
for comment in subreddit.stream.comments():
    comment_sentiment = TextBlob(comment.body).sentiment
    thread_title_sentiment = TextBlob(comment.submission.title).sentiment
    if comment_sentiment[0] < -0.2:
        print("---------------------------------")
        print("Found a negative comment")
        print("Author: ", comment.author)
        print("Body: ", comment.body)
        print("Comment Karma: ", comment.author.comment_karma)
        results.loc[i, 'Comment_ID'] = comment.id
        results.loc[i, 'Body'] = comment.body
        results.loc[i, 'Controversiality'] = comment.controversiality
        results.loc[i, 'Comment_Date'] = comment.created_utc
        results.loc[i, 'Comment_Score'] = comment.score
        results.loc[i, 'Polarity'] = comment_sentiment[0]
# -- Sentiment Analysis -- #
sub_df = pd.read_csv(
    "/Users/FCRA/Desktop/ALL/BSI/bsi-reddit-gme/pyfiles/sub_df.csv")
# BUG FIX: take an explicit copy. The original assigned new columns onto a
# column-slice view of sub_df, which raises pandas' SettingWithCopyWarning
# and may silently fail to write the new columns.
sent_df = sub_df[["created", "author", "title"]].copy()
# already preproc titles
sub_df2 = pd.read_csv(
    "/Users/FCRA/Desktop/ALL/BSI/bsi-reddit-gme/sentiment_files/preproc_titles.csv"
).reset_index(drop=True)
sent_df["ptitle"] = sub_df2["title"]

# --- General Sentiment of Titles with TextBlob
sent_df["polarity_textBlob"] = sent_df["ptitle"].apply(
    lambda x: TextBlob(x).sentiment.polarity)
# Bucket each submission to its calendar day, then average polarity per day.
sent_df["created"] = pd.to_datetime(sent_df["created"]).dt.floor('d')
daily_sent_df_textBlob = sent_df[["created", "polarity_textBlob"
                                  ]].groupby(["created"],
                                             as_index=False).mean()
# Standardize daily polarity by its own standard deviation (z-like score,
# not mean-centered -- kept as in the original).
daily_sent_df_textBlob["z_polarity_textBlob"] = (
    daily_sent_df_textBlob["polarity_textBlob"]
    / daily_sent_df_textBlob["polarity_textBlob"].std(axis=0))
#sent_df[["ptitle", "polarity_textBlob"]].to_csv("titles_textblob.csv")

# --- Sentiment using Vader and styled lexicon
# NOTE(review): `new_words` must be defined elsewhere in the file; it extends
# VADER's lexicon with domain-specific terms.
vader = SentimentIntensityAnalyzer()
vader.lexicon.update(new_words)