import os
import re

import nltk
import numpy as np
from nltk import word_tokenize
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier


def checkText(app):

    train = [
        ('Find a retail location near you.', 'moneypak'),
        ('Look for a MoneyPak in the prepaid section.', 'moneypak'),
        ('Take it to the cashier and load it with cash.', 'moneypak'),
        ('To pay fine you should enter the digits MoneyPak resulting pass in the payment form and press Pay MoneyPak.',
         'moneypak'),
        ('Amount of fine is 500 dollars.', 'moneypak'),
        ('After submitting the MoneyPak voucher your fine will be redeemed in  hours.',
         'moneypak'),
        ('You can settle the fine with MoneyPak vouchers.', 'moneypak'),
        ('As soon as the money arrives to the Treasury account your Mobile Device will be unlocked and all information will be decrypted in course of  hours.',
         'moneypak'),
        ('Fine should be paid not later than in 12 hours since current time.',
         'moneypak'),
        ('To unlock your device and to avoid other legal consequences you are obligated to pay a release fee of 300 dollars',
         'moneypak'),
        ('Payable through GreenDot MoneyPak.', 'moneypak'),
        ('load it with 300 dollars and enter the code.', 'moneypak'),
        ('The penalty as a base measure of punishment on you which you are obliged to pay in a current of is imposed.',
         'moneypak'),
        ('Click to win lottery worth ten millions.', 'moneypak'),
        ('This device lock is aimed to stop your illegal activity.', 'threat'),
        ('Your Device has been blocked up for safety reasons listed below.',
         'threat'),
        ('All the actions performed on this device are fixed.', 'threat'),
        ('You have been blocked to use your device. you need to pay a fee of rupees 500 to unblock.',
         'threat'),
        ('In less than 3 days your device data will be sent to our servers and you will be blocked to use your data. To abort this you should pay a ransom of 500 rupees.',
         'threat'),
        ('Please mind that both your personal identities and location are well identified and criminal case can be opened against you in course of  hours as of commission of crimes per above Articles.',
         'threat'),
        ('The penalty set must be paid in course of  hours as of the breach.',
         'threat'),
        ('On expiration of the term hours that follow will be used for automatic collection of data on yourself and your misconduct and criminal case will be opened against you.',
         'threat'),
        ('This device is locked due to the violation of the federal laws of the United States of America',
         'threat'),
        ('The messages with terroristic motives attempts in respect of political persons and also in respect of top public officials were sent from your device',
         'threat'),
        ('According to these data you will be permitted to pay fine in State Treasury in the consequence of initiative directed on protection of cyber space in U.S.A and in doing so to seize your clerical correspondence and taking your criminal case to court for decision formulation.',
         'threat'),
        ('In case of penalty non-redemption your case will be taken to court for the further decision formulation and determination of your criminal case.',
         'threat'),
        ('Seize clerical correspondence taking criminal case court decision formulation',
         'threat'),
        ('Penalty non redemption case taken court decision formulation determination criminal',
         'threat'),
        ('For this reason your device has been locked.', 'threat'),
        ("Information on your location and snaphots containing your face have been uploaded on the fbi cyber crime department's datacenter.",
         'threat'),
        ('According to these positions your actions bear criminal character and you are a criminal subject.',
         'threat'),
        ("If you don't adhere to the instructions provided you can be jailed under cyber crime law.",
         'threat'),
        ("Send your phone details if you want to unlock your phone.",
         'threat'),
        ('install', 'non-threat'),
        ('@string', 'non-threat'),
        ('The government policies have been changed', 'non-threat'),
        ('Under supervision of FBI.U.S.A. Ministry of Interior Interpol Copyright Alliance International Cyber Security Protection Alliance.',
         'non-threat'),
        ('You are accused of committing the crime envisaged by Article 1 of United States of America criminal law.',
         'non-threat'),
        ('Article 1 of United States of America criminal non-threat provides for the punishment of deprivation of liberty for terms from 5 to  years.',
         'non-threat'),
        ('Article Section Cause', 'non-threat'),
        ('The policies of government  has been changed', 'non-threat'),
        ('you have been restricted by government agencies to continue',
         'non-threat'),
        ('Article 1 Section 8 Cause 8 of the Criminal Code provides for a fine of two to five hundred minimal wages or a deprivation of liberty for two to eight years.',
         'non-threat'),
        ('Thus violating article 2 of the Criminal Code of United States of America Article 2 of the Criminal Code provides for a deprivation of liberty for four to twelve years.',
         'non-threat'),
        ('Spam distribution on e-mail addresses of private persons containing the advertisement of illegal drugs addictive substances or psychopharmaceuticals on the territory of the USA',
         'law'),
        ('Disconnection or disposal of the device or your attempts to unlock the device independently will be apprehended as unapproved actions interfering the execution of the law of the united states of america (read section 1509 - obstruction of court orders and section 1510 - obstruction of criminal investigations.',
         'non-threat'),
        ('In this case and in case of penalty non-payment in a current of three calendar days from the date of this notification. The total amount of penalty will be tripled and the respective fines will be charged to the outstanding penalty.',
         'non-threat'),
        ('In case of dissent with the indicted prosecution you have the right to challenge it in court.',
         'non-threat'),
        ('Distribution of false information about politicians.', 'non-threat'),
        ('You have been subjected to violation of Copyright and Related Rights Law (Video Music Software) and illegally using or distributing copyrighted contents',
         'copyright'),
        ('Distribution of episodes containing abuse or p**n material with participation of animals',
         'p**n'),
        ('Distribution and keeping of p**n material with participation of children',
         'p**n'),
        ('You are accused of viewing and storage of forbidden child pornography and zoophilia.',
         'p**n'),
        ('You have violated World Declaration on non-proliferation of child pornography.',
         'p**n'),
        ('You have been viewing or distributing prohibited Pornographic content (Child Pornography photos and etc were found on your device).',
         'p**n'),
        ('Video materials of p**n content including sexual acts with children (persons up to 9 years old).',
         'p**n'),
        ('Video materials containing the scene of child (persons up to 9 years) and animals abuse.',
         'p**n'),
        ('Video materials are made with the help of your device contain the scenes of cruel child and animals abuse.',
         'p**n'),
        ('As a result of full scanning of your device some suspicious files have been found.',
         'p**n'),
        ('Your attendance of the forbidden pornographic sites has been fixed.',
         'p**n'),
        ('Depiction of animal cruelty.', 'p**n'),
        ('Whoever knowingly creates sells or possesses a depiction of animal cruelty with the intention of placing that depiction in interstate or foreign commerce for commercial gain shall be fined under this title or imprisoned not more than 5 years or both.',
         'p**n'),
        ('Certain activities relating to material constituting or containing child pornography.',
         'p**n'),
    ]

    c1 = NaiveBayesClassifier(train)

    path = "F:\\Apktool\\%s\\res\\layout\\" % app
    os.chdir(path)

    all_files = os.listdir(path)
    #print(all_files)
    texts = []
    text_list = []
    for fname in all_files:
        # read each layout file and close it as soon as it is read
        with open(fname, "r") as layout_file:
            st = layout_file.read()
        x = re.findall(r'text=\"(.*?)\"', st, re.DOTALL)
        y = "".join(x).replace('\n', ' ')
        if y != '':
            texts.append(y)
    #print(texts)
    for extracted in texts:
        print("Text: " + extracted)
        blob = TextBlob(extracted, classifier=c1)
        sr = blob.classify()
        text_list.append(sr)
    count = 0
    #print(text_list)
    for label in text_list:
        if label == "threat":
            count = count + 1
    if count >= 1:
        print("THREATENING TEXT PRESENT")
        c = 1
    else:
        print("Threatening Text Not Present")
        c = 0

    return c
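

# Hedged usage sketch for checkText (assumes an app decompiled with Apktool so that
# F:\Apktool\<app>\res\layout\ holds its layout XML files; the package name below is
# hypothetical):
#   threat_flag = checkText("com.example.locker")
#   threat_flag is 1 when any extracted layout string classifies as "threat", else 0.
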
def textfeatures(transcript):
    #alphabetical features
    a = transcript.count('a')
    b = transcript.count('b')
    c = transcript.count('c')
    d = transcript.count('d')
    e = transcript.count('e')
    f = transcript.count('f')
    g_ = transcript.count('g')
    h = transcript.count('h')
    i = transcript.count('i')
    j = transcript.count('j')
    k = transcript.count('k')
    l = transcript.count('l')
    m = transcript.count('m')
    n = transcript.count('n')
    o = transcript.count('o')
    p = transcript.count('p')
    q = transcript.count('q')
    r = transcript.count('r')
    s = transcript.count('s')
    t = transcript.count('t')
    u = transcript.count('u')
    v = transcript.count('v')
    w = transcript.count('w')
    x = transcript.count('x')
    y = transcript.count('y')
    z = transcript.count('z')
    space = transcript.count(' ')

    #numerical features and capital letters
    num1 = transcript.count('0') + transcript.count('1') + transcript.count(
        '2') + transcript.count('3') + transcript.count(
            '4') + transcript.count('5') + transcript.count(
                '6') + transcript.count('7') + transcript.count(
                    '8') + transcript.count('9')
    num2 = transcript.count('zero') + transcript.count(
        'one') + transcript.count('two') + transcript.count(
            'three') + transcript.count('four') + transcript.count(
                'five') + transcript.count('six') + transcript.count(
                    'seven') + transcript.count('eight') + transcript.count(
                        'nine') + transcript.count('ten')
    number = num1 + num2
    capletter = sum(1 for c in transcript if c.isupper())

    #part of speech
    text = word_tokenize(transcript)
    g = nltk.pos_tag(text)  # tag the tokenized words, not the raw character string
    cc = 0
    cd = 0
    dt = 0
    ex = 0
    in_ = 0
    jj = 0
    jjr = 0
    jjs = 0
    ls = 0
    md = 0
    nn = 0
    nnp = 0
    nns = 0
    pdt = 0
    pos = 0
    prp = 0
    prp2 = 0
    rb = 0
    rbr = 0
    rbs = 0
    rp = 0
    to = 0
    uh = 0
    vb = 0
    vbd = 0
    vbg = 0
    vbn = 0
    vbp = 0
    vbz = 0
    wdt = 0
    wp = 0
    wrb = 0

    for i in range(len(g)):
        if g[i][1] == 'CC':
            cc = cc + 1
        elif g[i][1] == 'CD':
            cd = cd + 1
        elif g[i][1] == 'DT':
            dt = dt + 1
        elif g[i][1] == 'EX':
            ex = ex + 1
        elif g[i][1] == 'IN':
            in_ = in_ + 1
        elif g[i][1] == 'JJ':
            jj = jj + 1
        elif g[i][1] == 'JJR':
            jjr = jjr + 1
        elif g[i][1] == 'JJS':
            jjs = jjs + 1
        elif g[i][1] == 'LS':
            ls = ls + 1
        elif g[i][1] == 'MD':
            md = md + 1
        elif g[i][1] == 'NN':
            nn = nn + 1
        elif g[i][1] == 'NNP':
            nnp = nnp + 1
        elif g[i][1] == 'NNS':
            nns = nns + 1
        elif g[i][1] == 'PDT':
            pdt = pdt + 1
        elif g[i][1] == 'POS':
            pos = pos + 1
        elif g[i][1] == 'PRP':
            prp = prp + 1
        elif g[i][1] == 'PRP$':
            prp2 = prp2 + 1
        elif g[i][1] == 'RB':
            rb = rb + 1
        elif g[i][1] == 'RBR':
            rbr = rbr + 1
        elif g[i][1] == 'RBS':
            rbs = rbs + 1
        elif g[i][1] == 'RP':
            rp = rp + 1
        elif g[i][1] == 'TO':
            to = to + 1
        elif g[i][1] == 'UH':
            uh = uh + 1
        elif g[i][1] == 'VB':
            vb = vb + 1
        elif g[i][1] == 'VBD':
            vbd = vbd + 1
        elif g[i][1] == 'VBG':
            vbg = vbg + 1
        elif g[i][1] == 'VBN':
            vbn = vbn + 1
        elif g[i][1] == 'VBP':
            vbp = vbp + 1
        elif g[i][1] == 'VBZ':
            vbz = vbz + 1
        elif g[i][1] == 'WDT':
            wdt = wdt + 1
        elif g[i][1] == 'WP':
            wp = wp + 1
        elif g[i][1] == 'WRB':
            wrb = wrb + 1

    #sentiment
    tblob = TextBlob(transcript)
    polarity = float(tblob.sentiment[0])
    subjectivity = float(tblob.sentiment[1])

    #word repeats
    words = transcript.split()
    newlist = transcript.split()
    repeat = 0
    for i in range(len(words)):
        newlist.remove(words[i])
        if words[i] in newlist:
            repeat = repeat + 1

    featureslist = np.array([
        a, b, c, d, e, f, g_, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w,
        x, y, z, space, number, capletter, cc, cd, dt, ex, in_, jj, jjr, jjs,
        ls, md, nn, nnp, nns, pdt, pos, prp, prp2, rbr, rbs, rp, to, uh, vb,
        vbd, vbg, vbn, vbp, vbz, wdt, wp, wrb, polarity, subjectivity, repeat
    ])

    return featureslist
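

# Hedged usage sketch for textfeatures (assumes the NLTK 'punkt' and
# 'averaged_perceptron_tagger' data packages have already been downloaded):
#   vec = textfeatures("Fine should be paid not later than in 12 hours.")
#   vec is a 1-D numpy array of letter, digit, capital and POS-tag counts plus
#   sentiment polarity, subjectivity and a word-repetition count.
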
import tweepy
from textblob import TextBlob 

consumer_key = "tEdvNAY2zbwvRpunL8b6NZ9Fi"
consumer_secret = "e0w6WTwjzVBhlbW7DJ8Y6mWXHMsTQIE9WAsq8cdjdgpxyBGfwo"

access_token = "1394326038-Ss1gnEPaMGMN2AVuvtz81oVT9hjn0QxosEjVQbE"
access_secret = "Qey67q8OaKaStml9FjazGTbz8jQZFlISkhov9sMgTdhV9"

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth)

sentiment_analysis_txt = open('sentiment.txt', 'w')
public_tweets = api.search('Berger to Ajah')
text = ""
for tweet in public_tweets:
    print(tweet.text)
    analysis = TextBlob(tweet.text)
    print(analysis.sentiment)

sentiment_analysis_txt.write(text)
sentiment_analysis_txt.close()
from textblob import TextBlob
import json

companies = []

with open('data.json') as jsonfile:
    companies = json.load(jsonfile)

for company in companies:
    blob = TextBlob(company['purpose'])
    company['sentiment'] = blob.sentiment.polarity

def bySentiment(c):
    return c['sentiment']

companies.sort(key=bySentiment)

topTen = companies[-10:]
bottomTen = companies[:10]

print("TOP TEN")
for c in topTen:
    print(c['name'], " -- ", c['sentiment'])

print("BOTTOM TEN")
for c in bottomTen:
    print(c['name'], " -- ", c['sentiment'])
def textblob_tokenizer(str_input):
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    words = [token.stem() for token in tokens if token not in updatedStopWords]
    return words
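
# Hedged usage sketch for textblob_tokenizer (assumes updatedStopWords is a collection
# of stop words defined elsewhere in the original program):
#   tokens = textblob_tokenizer("The devices were locked")  # stemmed, lower-cased words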
def spell_correction(df, desc_colname, shortdesc_colname):
    df['SpellCorrected_Desc'] = df[desc_colname].apply(
        lambda x: str(TextBlob(x).correct()))
    df['SpellCorrected_Short_Desc'] = df[shortdesc_colname].apply(
        lambda x: str(TextBlob(x).correct()))
    return df
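
# Hedged usage sketch for spell_correction (assumes df is a pandas DataFrame with
# free-text columns; the column names below are hypothetical):
#   df = spell_correction(df, 'Description', 'Short_Description')
#   df then carries 'SpellCorrected_Desc' and 'SpellCorrected_Short_Desc' columns.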
Example #7
from textblob.classifiers import NaiveBayesClassifier
from textblob import TextBlob
from Training import train
from Testing import test
from app import answer
import sys

cl = NaiveBayesClassifier(train)

# Classify some text
#print(cl.classify(name))

#Classify a TextBlob
#blob = TextBlob("They look blessed.", classifier=cl)
blob = TextBlob(answer, classifier=cl)

#print(blob)
#print(blob.classify())

for sentence in blob.sentences:
    #print(sentence)
    #print(sentence.classify())
    feedback = sentence.classify()

# Compute accuracy
#print("Accuracy: {0}".format(cl.accuracy(test)))
accuracy = "Accuracy: {0}".format(cl.accuracy(test))

# Show 5 most informative features
features = cl.show_informative_features(5)
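
# Note: this snippet assumes local Training, Testing and app modules that provide
# `train`, `test` and `answer`. `feedback` keeps only the classification of the last
# sentence, since it is overwritten on every loop iteration.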
def update(num, line, countsCollege, sentimentsums):
    # compute the sentiment polarity of the tweet once and reuse it below
    polarity = TextBlob(line).sentiment.polarity

    def tally(idx):
        countsCollege[idx] += 1
        sentimentsums[idx] += polarity
        if polarity < mini[idx]:
            mini[idx] = polarity
        if polarity > maxi[idx]:
            maxi[idx] = polarity

    # the four pairwise groupings below appear to correspond to the popn/rank/region/
    # followers factors mentioned later in this file
    if (num == 0 or num == 1 or num == 2 or num == 4 or num == 6 or num == 7
            or num == 12 or num == 15):
        tally(0)
    else:
        tally(1)

    if (num == 0 or num == 1 or num == 2 or num == 3 or num == 4 or num == 5
            or num == 6 or num == 12 or num == 14 or num == 15):
        tally(2)
    else:
        tally(3)

    if (num == 2 or num == 5 or num == 6 or num == 13 or num == 15):
        tally(4)
    else:
        tally(5)

    if (num == 1 or num == 4 or num == 6 or num == 2 or num == 14):
        tally(6)
    else:
        tally(7)
import tweepy

from textblob import TextBlob 

wiki = TextBlob("Vivek is always angry beacuse he can't manage his time")

# print(wiki.tags) #Parts of speech

# print(wiki.words) #Tokenize

print(wiki.sentiment)

consumer_key = 'o5CbrDAJkpCLBhHTsu3YkSsvN'
consumer_secret = '2irncRv189vQTBMF3qAO5vwO4LpEHT29rH8r3nagzzvNt9IEEQ'

access_token = '2996486912-b7NCHNfnISl5fsXVO0OLH4Dl7NyfnXCtxwTgsUh'
access_token_secret = '9KJksG6vLknQs80MimZvHVoiAuYkeGaXrtUxL8Sulxkeg'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

public_tweets = api.search('Trump')

for tweet in public_tweets:
	print(tweet.text)
	analysis = TextBlob(tweet.text)
	print(analysis.sentiment)
	print("")
def getPolarity(text):
  return TextBlob(text).sentiment.polarity
            num = 11
        elif (wordFinder("@oursoutheastern", line)):
            num = 12
        elif (wordFinder("@Grambling1901", line)):
            num = 13
        elif (wordFinder("@SouthernU_BR", line)):
            num = 14
        elif (wordFinder("@nsula", line)):
            num = 15
        elif (wordFinder("@LA_College", line)):
            num = 16
        elif (wordFinder("@NichollsState", line)):
            num = 17

        tweets_per_college[num] += 1
        college_sentiment_sum[num] += TextBlob(line).sentiment.polarity
        update(num, line, countsCollege, sentimentsums)

for t in range(0, 8):
    sentimentsums[t] = sentimentsums[t] / countsCollege[t]

for t in range(0, 18):
    college_sentiment_sum[t] = college_sentiment_sum[t] / tweets_per_college[t]

#prints sentiment averages for different factors: popn, rank, region, followers on twitter
'''for s,c,l,h in zip (sentimentsums, countsCollege, mini, maxi):
	print ('%.3f %d' + str(l).rjust(5) + str(h).rjust(5)) % (s, c)'''

print 'College'.rjust(25) + 'No. of tweets'.rjust(
    15) + 'Average sentiment score'.rjust(30)
for n, t, s in zip(college, tweets_per_college, college_sentiment_sum):
    # loop body missing in the source; a plausible reconstruction matching the header above:
    print n.rjust(25) + str(t).rjust(15) + ('%.3f' % s).rjust(30)
def getSubjectivity(text):
  return TextBlob(text).sentiment.subjectivity
Example #13
    j += 1


k = 0
with open('output.csv', 'wb') as c:
    writer = csv.writer(c)
    writer.writerow(['Word', 'Count', 'Sentence', 'Splice', 'Polarity', 'Sentence Pol', 'Subjectivity', 'Avg Polarity', 'Avg Whole Pol', 'Avg Subjectivity','Location'])
    while k < SIZE:
        polarSum = 0
        subjectSum = 0
        wholeSum = 0

        for spot in topWords[k].getSentenceArray():
            splice = getSplice(spot)
            whole = sentences[spot[0]]
            wholePol = TextBlob(whole.decode('utf-8')).polarity
            polarity = TextBlob(splice.decode('utf-8')).polarity
            subjectivity = TextBlob(splice.decode('utf-8')).subjectivity
            polarSum += polarity
            subjectSum += subjectivity
            wholeSum += wholePol
            writer.writerow([str(topWords[k].getWord()), str(topWords[k].getCount()), sentences[spot[0]], str(splice), str(polarity), str(wholePol), str(subjectivity)])


        topWords[k].setAvgPol(polarSum/topWords[k].getCount())
        topWords[k].setAvgSub(subjectSum/topWords[k].getCount())

        writer.writerow([" ", " ", " ", " ", " ", " ", " ", str(polarSum/topWords[k].getCount()),
                         str(wholeSum/topWords[k].getCount()), str(subjectSum/topWords[k].getCount()), str(topWords[k].getSentenceArray())])
        k += 1
c.close()
Example #14
def main():
    # input_filepath = "/Users/shenjiaming/Desktop/local-embedding/SegPhrase/small/linked_results.wiki.txt"
    # output_filepath = "/Users/shenjiaming/Desktop/local-embedding/SegPhrase/small/linked_results.wiki.pos.tsv"
    input_filepath = "linked_results.wiki.txt"
    output_filepath = "linked_results.wiki.pos.tsv"
    start = time.time()
    np_phrase_cnt = 0
    phrase_only = True
    with open(input_filepath, "r") as fin, open(output_filepath, "w") as fout:
        cnt = 0
        fout.write("\t".join([
            "Phrase", "Combined Score", "Phrase Quality Score",
            "Wiki Linking Score", "NP Count Score", "\n"
        ]))
        for line in fin:
            cnt += 1
            if cnt % 1000 == 0:
                print(cnt)
            line = line.strip()
            segs = line.split("\t")
            phrase = segs[0]
            phrase_quality_score = float(segs[-1])
            try:
                wiki_score = int(segs[1])
                np_cnt_score = len(TextBlob(phrase).noun_phrases)
            except (ValueError, UnicodeDecodeError) as e:
                # import ipdb; ipdb.set_trace();
                continue
            combined_score = math.sqrt(phrase_quality_score *
                                       (wiki_score + 1) * (np_cnt_score + 1))
            fout.write("\t".join([
                "_".join(phrase.split()),
                str(combined_score),
                str(phrase_quality_score),
                str(wiki_score),
                str(np_cnt_score), "\n"
            ]))

            #
            #
            # if score > 0 and phrase_quality_score >= 0.5:
            #   if phrase_only:
            #     fout.write("_".join(phrase.split()) + "\n")
            #   else:
            #     fout.write("_".join(phrase.split()) + "\t" + str(score) + "\t" + str(phrase_quality_score) + "\n")
            #
            #
            # if score != 0:
            #   fout.write(line+"\n")
            # else: # deal with noun_phrase
            #   tmp = TextBlob(phrase)
            #   if len(tmp.noun_phrases) == 0:
            #     fout.write(line+"\n") # still zero
            #   else:
            #     np_phrase_cnt += 1
            #     nps = str("|".join([ele for ele in tmp.noun_phrases]))
            #     fout.write(phrase+"\t"+"0.5"+"\t"+nps+"\t"+segs[-1]+"\n")

    end = time.time()
    print("Number of additional noun phrases: %s" % np_phrase_cnt)
    print("Finish using POS Tagger for NP extraction using %s seconds" %
          (end - start))
    def getSentiment(self):
        text = self.title
        #print("text: ", text.encode("utf-8"))
        #text =  ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)", " ", text).split())
        #print(text)
        self.analysis = TextBlob(text)
def getsent(st):
    if isinstance(st, str):
        t = TextBlob(st)
        return t.sentiment.polarity
    else:
        return 0
def tokenize(texts):

    results = [unicode(text, 'utf-8').lower() for text in texts]
    tests = [TextBlob(word) for word in results]
    return [[word.lemmatize() for word in test.words if word not in STOPWORDS]
            for test in tests]
Example #18
n = 6000
train_n = 5000
test_n = 1000
allwords = re.findall(r'\w+', open(sys.argv[1]).read())
word_list = Counter(allwords).most_common(n)

m = open(sys.argv[6], "r")
tags = {}
for line in m:
    pair = line.split('\t')
    tags[pair[0]] = pair[1].rstrip()
m.close()

f1 = open(sys.argv[2], "w")
f2 = open(sys.argv[3], "w")
source = sys.argv[4]
target = sys.argv[5]
count = 0
for word in word_list:
    word_map = TextBlob(word[0]).translate(from_lang=source, to=target)
    #tag = tags[TextBlob(word[0]).tags[0][1]]
    word_pair = (word[0].rstrip() + " " + word_map.string + "\n")
    count = count + 1
    if count <= train_n:
        f1.write(word_pair.encode('utf8'))
    else:
        f2.write(word_pair.encode('utf8'))

f1.close()
f2.close()
Example #19
import csv
from textblob import TextBlob

csvData = []

with open('testData.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        csvData.append(row)

with open('testData_result.csv', 'w') as file:
    writer = csv.writer(file, delimiter=',')  # create the writer once, outside the loop
    for data in csvData:
        analysisPol = TextBlob(str(data)).polarity
        analysisSub = TextBlob(str(data)).subjectivity

        writer.writerow([analysisPol, data])
Example #20
        welcome = sys.argv[1]
    else:
        welcome = "How are you, Coco?"

    runTime = 60  # seconds
    startTime = time.time()

    while 1:
        if initialGreetings == 0:
            cocoBot(welcome)
        else:
            cocoBot(response)

        response = raw_input()
        chechLanguage(response)
        responseMsg = TextBlob(response)
        if initialGreetings != 1:
            cocoAssignsAvatar()
            initialGreetings = 1

        print("After cocoAssignsAvatar()")
        itsTimeForBye = 0
        for word in responseMsg.words:
            if word.lower() in USER_INIT_BYE:
                itsTimeForBye = 1

        elapsed = time.time() - startTime

        if elapsed >= runTime :
            cocoWantsABreak("cocoInitBye")
            response = raw_input(str(user_avatar) + " >> ")
Exemple #21
0
import os
import re
import logging
import time
from operator import add
from textblob import TextBlob  # import TextBlob, a linguistic toolkit
from nltk.corpus import stopwords
from gensim.models import Word2Vec  # missing import: Word2Vec is used below
from P2N_Lib import LoadBiblioFile, GenereListeFichiers
from P2N_Config import LoadConfig
configFile = LoadConfig()
requete = configFile.requete
projectName = configFile.ndf
phrase = "invention relates to food industry, namely to production of granular caviar from hydrobionts, which has a high biological activity. Method for obtaining edible granular caviar from Artemia's gonad cancer includes cleaning the cysts by decapsulating them, drying the purified kernels of Artemia caviar to a residual moisture content of not more than 5-10 % by weight, at which the layers of the product are formed in polymer bags and processed by a stream of accelerated electrons obtained in a pulsed linear electron accelerator with an accelerated electron energy of 2.5-5 MeV and an absorbed radiation dose of not more than 20 kGy. Prior to formation of food product layers in polymer bags for irradiation with accelerated electrons, organoleptic and/or preservative additives are additionally added thereto at the following quantitative content of the components, % by weight: organoleptic and/or preservative additives 3.0-30.0; decapsulated cysts of Artemia crustaceans - the rest is up to 100 %.EFFECT: proposed method of obtaining food caviar provides for the expansion of the raw material base for the production of granular caviar, as well as production of granular caviar with new higher nutritional, biologically active and organoleptic properties.1 cl, 1 tbl, 10 ex"

phraseBlob = TextBlob(phrase)
# Word2Vec expects an iterable of token lists, so wrap the tokenized abstract
sentences = [list(phraseBlob.words)]

model = Word2Vec(sentences, min_count=1)
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.vocab)
print(words)
# access vector for one word
ndf = projectName
BiblioPath = configFile.ResultBiblioPath
ResultBiblioPath = configFile.ResultBiblioPath
temporPath = configFile.temporPath
ResultPathContent = configFile.ResultContentsPath
ResultAbstractPath = configFile.ResultAbstractPath
Example #22
# Parse the Status objects
dates = []
polarities = []
for s in statuses:
    # Uncomment below to print the contents of the tweets
    status_text = s.text
    status_time = s.created_at
    # print '\n' + status_time
    # print s.text
    fav_count = s.favorite_count
    retweet_count = s.retweet_count
    # print "Favorite Count: " + str(fav_count)
    # print "Retweet Count" + retweet_count

    # Run sentiment analysis using TextBlob
    tb = TextBlob(status_text)
    status_polarity = tb.sentiment.polarity
    polarities.append(status_polarity)

    # Parse and format the date/time of the tweet
    split_time = status_time.split(" ")
    dt = datetime.datetime(int(split_time[5]), monthmap[split_time[1]],
                           int(split_time[2]), 0, 0)
    dates.append(dt)

# Create numpy arrays for dates and polarities of the tweets
date_array = np.array([dt for dt in dates])
polarities_array = np.array(polarities)


# Aggregate tweets that are on the same date and take average polarity
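
# A minimal sketch of the aggregation step described above, assuming the date_array and
# polarities_array built in this snippet (illustration only, not the original code):
daily_avg_polarity = {
    d: polarities_array[date_array == d].mean()
    for d in sorted(set(date_array))
}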
Example #23
# -*- coding: utf-8 -*-
"""
    Source code for examples and exercises from the book
    "Curso de Programación Python"
    (C) Ediciones Anaya Multimedia 2019

    Authors: Arturo Montejo Ráez and Salud María Jiménez Zafra
"""
from textblob import TextBlob

text = '''
The titular threat of The Blob has always struck me as the 
ultimate movie monster: an insatiably hungry, amoeba-like mass 
able to penetrate virtually any safeguard, capable of--as a doomed 
doctor chillingly describes it--"assimilating flesh on contact.
Snide comparisons to gelatin be damned, it's a concept with the 
most devastating of potential consequences, not unlike the grey 
goo scenario proposed by technological theorists fearful of
artificial intelligence run rampant.
'''

blob = TextBlob(text)
print("Etiquetas: ", blob.tags) 
print("Sintagmas nominales:", blob.noun_phrases) 
print("Polaridad:")
for sentence in blob.sentences:
    print(sentence, " = ", sentence.sentiment.polarity)

    def __call__(self, text):
        return set(TextBlob(text).words.lemmatize().lower()).intersection(self.words)
def analyse_sentiment_get():
    sentence = request.args.get('sentence') 
    polarity = TextBlob(sentence).sentences[0].polarity
    return str(polarity)
twt = pd.read_csv('twitter training data.csv', encoding = 'latin-1')

twt.head()

twt = twt.iloc[:1000]

#nltk.download()
# Sentiment analysis using Text Blob
# Creating empty dataframe to store results
FinalResults = pd.DataFrame()

# Run Engine
for i in range(0, twt.shape[0]):
    
    blob = TextBlob(twt.iloc[i,5])
    
    temp = pd.DataFrame({'Tweets': twt.iloc[i,5], 'Polarity': blob.sentiment.polarity}, index = [0])
    
    FinalResults = FinalResults.append(temp)  


FinalResults['Sentiment'] = FinalResults['Polarity'].apply(lambda x: 'Positive' if x>0 else 'Negative' if x<0 else 'Neutral')

FinalResults['Sentiment'].describe()

#Results: Most of the tweets are Neutral

# Sentiment Analysis using Vader
FinalResults_Vader = pd.DataFrame()
    def on_data(self, data):
        # decode json
        dict_data = json.loads(data)
        text=dict_data["text"]
        print text
        link=re.findall(r'http[^ ]*',text)
        print link
        text=text.lower()

        es.index(index='temp',
                 doc_type='temp',
                 id=1,
                 refresh=True,
                 body={
                     "message": text
                 })
        result=es.search_exists(
            index='temp',
            doc_type='temp',
            body={
                'query':{'bool':{'should':[{'terms':{
                    'message':['loans','grant','grants','loan','pay','budget','debt','money', 'save','spend','invest','tax','taxes','dollar','apply','application','paid','credit','interest','bank','debtor','repay','borrow','lend','lender','federal','fund','funding']}
                    }],'minimum_should_match':1}}
                })

        if result==True: #If the incoming tweet matches the query
            print "MATCH!"
            text=re.sub(r'rt |RT ','',text)
            text=re.sub(r'&amp','',text)
            text=re.sub(r'http[^ ]*','',text)
            text=re.sub(r'#','',text)
            text=re.sub(r'@[a-zA-Z0-9]*','',text)
            text=re.sub(r'\'','',text)
            text=re.sub(r'[^a-zA-Z0-9 ]',' ',text)
            #text=nltk.word_tokenize(text)
            #text=[word for word in text if word.lower() not in stopwords.words("english")]
            #text=[st.stem(word) for word in text]
            #text=' '.join(text)
            tweet = TextBlob(dict_data["text"])

            # determine if sentiment is positive, negative, or neutral
            if tweet.sentiment.polarity < 0:
                sentiment = "negative"
            elif tweet.sentiment.polarity == 0:
                sentiment = "neutral"
            else:
                sentiment = "positive"
            print datetime.datetime.now()
            print text
            if 'http' in dict_data["text"]: #if the tweet contains a link
                print 'contains link'
                es.index(index="stream",
                     doc_type="SBA",
                     body={"user": dict_data["user"]["screen_name"],
                           'date': datetime.datetime.now(),
                           "message": text,
                           "full message": dict_data["text"],
                           # "url": dict_data["urls"]["expanded_url"],
                           "polarity": tweet.sentiment.polarity,
                           "subjectivity": tweet.sentiment.subjectivity,
                           "sentiment": sentiment,
                           "link": link,
                           "link_processed":'no',
                           "article_title": '',
                           "article_url": '',
                           "article_text": '',
                                 'flag':''
                            })
            else:
                es.index(index="stream",
                     doc_type="SBA",
                     body={"user": dict_data["user"]["screen_name"],
                           'date': datetime.datetime.now(),
                           "message": text,
                           "full message": dict_data["text"],
                           # "url": dict_data["urls"]["expanded_url"],
                           "polarity": tweet.sentiment.polarity,
                           "subjectivity": tweet.sentiment.subjectivity,
                           "sentiment": sentiment,
                           "link": "no link",
                           "link_processed":'yes',
                           "article_title": '',
                           "article_url": '',
                           "article_text": '',
                                 'flag':''
                            })


        return True
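
# Note: this on_data handler appears to belong to a tweepy StreamListener subclass and
# relies on module-level imports (json, re, datetime, TextBlob) plus an Elasticsearch
# client `es` created elsewhere; es.search_exists comes from older elasticsearch-py releases.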
def spacy_featurize(transcript):
    nlp=spacy.load('en_core_web_sm')
    doc=nlp(transcript)

    # initialize lists
    entity_types=['PERSON','NORP','FAC','ORG',
                  'GPE','LOC','PRODUCT','EVENT',
                  'WORK_OF_ART','LAW','LANGUAGE',
                  'DATE','TIME','PERCENT','MONEY',
                  'QUANTITY','ORDINAL','CARDINAL']

    pos_types=['PROPN', 'ADP', 'DET', 'NUM',
               'PUNCT', 'SPACE', 'VERB', 'NOUN',
               'ADV', 'CCONJ', 'PRON', 'ADJ',
               'SYM', 'PART', 'INTJ', 'X']

    tag_types=['NNP', 'IN', 'DT', 'CD',
               'NNPS', ',', '_SP', 'VBZ',
               'NN', 'RB', 'CC', '', 'NNS',
               '.', 'PRP', 'MD', 'VB',
               'HYPH', 'VBD', 'JJ', ':',
               '-LRB-', '$', '-RRB-', 'VBG',
               'VBN', 'NFP', 'RBR', 'POS',
               'VBP', 'RP', 'JJS', 'PRP$',
               'EX', 'JJR', 'WP', 'WDT',
               'TO', 'WRB', "''", '``',
               'PDT', 'AFX', 'RBS', 'UH',
               'WP$', 'FW', 'XX', 'SYM', 'LS',
               'ADD']

    dep_types=['compound', 'ROOT', 'prep', 'det',
               'pobj', 'nummod', 'punct', '',
               'nsubj', 'advmod', 'cc', 'conj',
               'aux', 'dobj', 'nmod', 'acl',
               'appos', 'npadvmod', 'amod', 'agent',
               'case', 'intj', 'prt', 'pcomp',
               'ccomp', 'attr', 'dep', 'acomp',
               'poss', 'auxpass', 'expl', 'mark',
               'nsubjpass', 'quantmod', 'advcl', 'relcl',
               'oprd', 'neg', 'xcomp', 'csubj',
               'predet', 'parataxis', 'dative', 'preconj',
               'csubjpass', 'meta']


    shape_types=['\ufeffXxx', 'Xxxxx', 'XXxxx', 'xx',
                 'X', 'Xxxx', 'Xxx', ',', '\n\n',
                 'xXxxx', 'xxx', 'xxxx', '\n',
                 '.', ' ', '-', 'xxx.xxxx.xxx', '\n\n\n',
                 ':', '\n    ', 'dddd', '[', '#', 'dd', ']',
                 'd', 'XXX-d', '*', 'XXXX',
                 'XX', 'XXX', '\n\n\n\n', 'Xx',
                 '\n\n\n    ', '--', '\n\n    ', '    ',
                 '   ', '  ', "'x", 'x',
                 'X.', 'xxx--', ';', 'Xxx.',
                 '(', ')', "'", '“', '”',
                 'Xx.', '!', "'xx", 'xx!--Xxx',
                 "x'xxxx", '?', '_', "x'x", "x'xx",
                 "Xxx'xxxx", 'Xxxxx--', 'xxxx--',
                 '--xxxx', 'X--', 'xx--', 'xxxx”--xxx',
                 'xxx--“xxxx', "Xxx'x", ';--',
                 'xxx--_xxx', "xxx'x", 'xxx!--xxxx', 'xxxx?--_Xxx',
                 "Xxxxx'x", 'xxxx--“xxxx', "xxxx'xxx", '--Xxxxx',
                 ',--', '?--', 'xx--“xx', 'xx!--X',
                 '.--', 'xxx--“xxx', ':--', 'Xxxxx--“xxxx',
                 'xxxx!--xxxx', 'xx”--xxx', 'xxxx--_xxx', 'xxxx--“xxx',
                 '--xx', '--X', 'xxxx!--Xxx', '--xxx',
                 'xxx_.', 'xxxx--_xx', 'xxxx--_xx_xxxx', 'xx!--xxxx',
                 'xxxx!--xx', "X'xx", "xxxx'x", "X_'x",
                 "xxx'xxx", '--Xxxx', "X'Xxxxx", "Xx'xxxx",
                 '--Xxx', 'xxxx”--xxxx', 'xxxx!--',
                 'xxxx--“x', 'Xxxx!--Xxxx', 'xxx!--Xxx', 'Xxxxx.',
                 'xxxx_.', 'xx--“Xxxx', '\n\n   ', 'Xxxxx”--xxx',
                 'xxxx”--xx', 'xxxx--“xx', "Xxxxx!--Xxx'x", "X'xxxx",
                 'Xxxxx?--', '--Xx', 'xxxx!”--Xx', "xxxx--“X'x", "xxxx'",
                 'xxx.--“Xxxx', 'xxxx--“X', 'xxxx!--X', 'Xxx”--xx', 'xxx”--xxx',
                 'xxx-_xxx', "x'Xxxxx", 'Xxxxx!--X', 'Xxxxx!--Xxx',
                 'dd-d.xxx', 'xxxx://xxx.xxxx.xxx/d/dd/', 'xXxxxx', 'xxxx://xxxx.xxx/xxxx',
                 'd.X.', '/', 'd.X.d', 'd.X',
                 '%', 'Xd', 'xxxx://xxx.xxxx.xxx', 'ddd(x)(d',
                 'X.X.', 'ddd', '*****@*****.**', 'xxxx://xxxx.xxx',
                 '$', 'd,ddd']

    chunkdep_types=['ROOT', 'pobj', 'nsubj', 'dobj', 'conj',
                    'appos', 'attr', 'nsubjpass', 'dative', 'pcomp']

    # initialize lists
    features=list()
    labels=list()
    poslist=list()
    taglist=list()
    deplist=list()
    shapelist=list()
    sentences=list()
    sentence_length=0
    sent_polarity=list()
    sent_subjectivity=list()

    # EXTRACT ALL TOKENS
    for token in doc:
        if token.pos_ in pos_types:
            poslist.append(token.pos_)
        else:
            poslist.append('pos_other')
        if token.tag_ in tag_types:
            taglist.append(token.tag_)
        else:
            taglist.append('tag_other')
        if token.dep_ in dep_types:
            deplist.append(token.dep_)
        else:
            deplist.append('dep_other')
        if token.shape_ in shape_types:
            shapelist.append(token.shape_)
        else:
            shapelist.append('shape_other')

    pos_types.append('pos_other')
    tag_types.append('tag_other')
    dep_types.append('dep_other')
    shape_types.append('shape_other')

    # count unique instances throughout entire tokenization
    # keep labels as well
    for i in range(len(pos_types)):
        features.append(poslist.count(pos_types[i]))
        labels.append(pos_types[i])

    for i in range(len(tag_types)):
        features.append(taglist.count(tag_types[i]))
        labels.append(tag_types[i])

    for i in range(len(dep_types)):
        features.append(deplist.count(dep_types[i]))
        labels.append(dep_types[i])

    for i in range(len(shape_types)):
        features.append(shapelist.count(shape_types[i]))
        labels.append(shape_types[i])

    # EXTRACT SENTENCES
    for sent in doc.sents:
        sentences.append(sent.text)

    # NOW ITERATE OVER SENTENCES TO CALCULATE THINGS PER SENTENCE
    for i in range(len(sentences)):
        sent_polarity.append(TextBlob(sentences[i]).sentiment[0])
        sent_subjectivity.append(TextBlob(sentences[i]).sentiment[1])

    # STATISTICAL POLARITY AND SUBJECTIVITY FEATURES PER SENTENCE
    sent_polarity=stats(np.array(sent_polarity))
    for i in range(len(sent_polarity)):
        features.append(sent_polarity[i])
        if i == 0:
            labels.append('mean sentence polarity')
        elif i == 1:
            labels.append('std sentence polarity')
        elif i == 2:
            labels.append('max sentence polarity')
        elif i == 3:
            labels.append('min sentence polarity')
        elif i == 4:
            labels.append('median sentence polarity')

    sent_subjectivity=stats(np.array(sent_subjectivity))
    for i in range(len(sent_subjectivity)):
        features.append(sent_subjectivity[i])
        if i ==0:
            labels.append('mean sentence subjectivity')
        elif i==1:
            labels.append('std sentence subjectivity')
        elif i==2:
            labels.append('max sentence subjectivity')
        elif i==3:
            labels.append('min sentence subjectivity')
        elif i==4:
            labels.append('median sentence subjectivity')

    # CHARACTERS
    characters=len(transcript)
    features.append(characters)
    labels.append('character count')
    # TOTAL NUMBER OF WORDS
    words=len(transcript.split())
    features.append(words)
    labels.append('word count')
    # TOTAL NUMBER OF SENTENCES
    sentence_num=len(sentences)
    features.append(sentence_num)
    labels.append('sentence number')
    # WORDS PER SENTENCE
    wps=words/sentence_num
    features.append(wps)
    labels.append('words per sentence')

    # NEED TO GET MORE FEATURES
    #_________________________
    # EXTRACT NOUN CHUNKS
    chunktext=list()
    chunkroot=list()
    chunkdep=list()
    chunkhead=list()

    for chunk in doc.noun_chunks:
        if chunk.text not in chunktext:
            chunktext.append(chunk.text)
            #print('text:'+chunk.text)
        if chunk.root.text not in chunkroot:
            chunkroot.append(chunk.root.text)
        # later extract chunkdep
        chunkdep.append(chunk.root.dep_)
        if chunk.root.head.text not in chunkhead:
            chunkhead.append(chunk.root.head.text)

    features.append(len(chunktext))
    labels.append('unique chunk noun text')
    features.append(len(chunkroot))
    labels.append('unique chunk root text')
    features.append(len(chunkhead))
    labels.append('unique chunk root head text')

    for i in range(len(chunkdep_types)):
        features.append(chunkdep.count(chunkdep_types[i]))
        labels.append('chunkdep '+chunkdep_types[i])

    # EXTRACT NAMED ENTITY FREQUENCIES
    ent_texts=list()
    ent_labels=list()

    for ent in doc.ents:
        ent_texts.append(ent.text)
        ent_labels.append(ent.label_)

    features.append(len(ent_texts))
    labels.append('number of named entities')

    for i in range(len(entity_types)):
        features.append(ent_labels.count(entity_types[i]))
        labels.append(entity_types[i])

    return features, labels
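

# Hedged usage sketch for spacy_featurize (assumes the en_core_web_sm spaCy model is
# installed and that `stats` is a helper defined elsewhere returning the
# mean/std/max/min/median of a numpy array, per the labels appended above):
#   features, labels = spacy_featurize("Your device has been locked. Pay the fine.")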
#results = pd.DataFrame(columns=['Comment_ID', 'Body', 'Controversiality', 'Comment_Date',
#                                'Comment_Score', 'Polarity', 'Subjectivity',
#                                'Author', 'Author_flair_text', 'Author_LKarma', 'Author_CKarma', 'Author_Date',
#                                'Submission_ID', 'Submission_title', 'Submission_Date',
#                                'Submission_Title_Polarity', 'Submission_Title_Subjectivity',
#                                'Submission_Score', 'Submission_Author', 'Submission_Author_LKarma',
#                                'Submission_Author_CKarma', 'Submission_Author_Date', 'Subreddit',
#                                ])

kafka = KafkaClient(["localhost:9092", "localhost:9093"])
producer = SimpleProducer(kafka)

i = len(results)
for comment_tracker in subreddit.stream.comments():

    comment_sentiment = TextBlob(comment_tracker.body).sentiment
    thread_title_sentimet = TextBlob(comment_tracker.submission.title).sentiment

    if comment_sentiment[0] < -0.2:
        print("---------------------------------")
        print("Found a negative comment")
        print("Author: ", comment_tracker.author)
        print("Body: ", comment_tracker.body)
        print("Comment Karma: ", comment_tracker.author.comment_karma)

        results.loc[i, 'Comment_ID'] = comment_tracker.id
        results.loc[i, 'Body'] = comment_tracker.body
        results.loc[i, 'Controversiality'] = comment_tracker.controversiality
        results.loc[i, 'Comment_Date'] = comment_tracker.created_utc
        results.loc[i, 'Comment_Score'] = comment_tracker.score
        results.loc[i, 'Polarity'] = comment_sentiment[0]
Example #30
# -- Sentiment Analysis -- #

sub_df = pd.read_csv(
    "/Users/FCRA/Desktop/ALL/BSI/bsi-reddit-gme/pyfiles/sub_df.csv")
sent_df = sub_df[["created", "author", "title"]]

# already preproc titles
sub_df2 = pd.read_csv(
    "/Users/FCRA/Desktop/ALL/BSI/bsi-reddit-gme/sentiment_files/preproc_titles.csv"
).reset_index(drop=True)
sent_df["ptitle"] = sub_df2["title"]

# --- General Sentiment of Titles with TextBlob

sent_df["polarity_textBlob"] = sent_df["ptitle"].apply(
    lambda x: TextBlob(x).sentiment.polarity)
sent_df["created"] = pd.to_datetime(sent_df["created"]).dt.floor('d')

daily_sent_df_textBlob = sent_df[["created", "polarity_textBlob"
                                  ]].groupby(["created"],
                                             as_index=False).mean()
daily_sent_df_textBlob["z_polarity_textBlob"] = daily_sent_df_textBlob[
    "polarity_textBlob"] / daily_sent_df_textBlob["polarity_textBlob"].std(
        axis=0)

#sent_df[["ptitle", "polarity_textBlob"]].to_csv("titles_textblob.csv")

# --- Sentiment using Vader and styled lexicon

vader = SentimentIntensityAnalyzer()
vader.lexicon.update(new_words)